Spaces:

akashkj
/

H2OGPT

Runtime error

App Files Files Community

akashkj committed on Jul 7, 2023

Commit

3f7cfab

•

1 Parent(s): 947e63a

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env_gpt4all +17 -0
.gitattributes +26 -0
.github/workflows/snyk-scan.yml +76 -0
.gitignore +37 -0
Dockerfile +20 -0
Dockerfile-runner.in +15 -0
LICENSE +201 -0
Makefile +63 -0
README.md +209 -7
__pycache__/enums.cpython-310.pyc +0 -0
__pycache__/generate.cpython-310.pyc +0 -0
__pycache__/gpt4all_llm.cpython-310.pyc +0 -0
__pycache__/gpt_langchain.cpython-310.pyc +0 -0
__pycache__/gradio_runner.cpython-310.pyc +0 -0
__pycache__/gradio_themes.cpython-310.pyc +0 -0
__pycache__/h2oai_pipeline.cpython-310.pyc +0 -0
__pycache__/loaders.cpython-310.pyc +0 -0
__pycache__/prompter.cpython-310.pyc +0 -0
__pycache__/stopping.cpython-310.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
__pycache__/utils_langchain.cpython-310.pyc +0 -0
blog/README.md +81 -0
ci/jenkinsfile +158 -0
cli.py +108 -0
client/.gitignore +164 -0
client/Makefile +25 -0
client/README.md +41 -0
client/h2ogpt_client/__init__.py +4 -0
client/h2ogpt_client/core.py +314 -0
client/poetry.lock +876 -0
client/poetry.toml +1 -0
client/pyproject.toml +39 -0
client/tests/__init__.py +0 -0
client/tests/test_client.py +55 -0
client_test.py +337 -0
create_data.py +1809 -0
data/NGSL_1.2_stats.csv.zip +3 -0
data/README-template.md +23 -0
data/censor_words.txt +10 -0
data/config.json +0 -0
data/count_1w.txt.zip +3 -0
data/create_data_cards.py +144 -0
data/dai_docs.train.json +0 -0
data/dai_docs.train_cleaned.json +0 -0
data/dai_docs.valid.json +101 -0
data/dai_faq.json +477 -0
data/h2ogpt-personality.json +642 -0
data/merged.json +0 -0
data/pexels-evg-kowalievska-1170986_small.jpg +0 -0
docker-compose.yml +28 -0

.env_gpt4all ADDED Viewed

	@@ -0,0 +1,17 @@

+# GPT4ALL or llama-cpp-python model_kwargs
+# GPT4All GPT-J type, from model explorer choice, so it downloads
+model_name_gptj=ggml-gpt4all-j-v1.3-groovy.bin
+# llama-cpp-python type, supporting version 3 quantization, here from locally built llama.cpp q4 v3 quantization
+# below uses prompt_type=wizard2
+model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin
+# below assumes max_new_tokens=256
+n_ctx=1792
+# uncomment below if using llama-cpp-python with cuBLAS built in
+# n_gpu_layers=20
+# GPT4All LLaMa type, supporting version 2 quantization, here from model explorer choice so it downloads
+model_name_gpt4all_llama=ggml-wizardLM-7B.q4_2.bin
+# PDF_CLASS_NAME=UnstructuredPDFLoader

.gitattributes CHANGED Viewed

@@ -33,3 +33,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+offline_folder/embed_out.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.27.attention.bias.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.27.attention.dense.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.27.attention.query_key_value.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.27.mlp.dense_4h_to_h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.27.mlp.dense_h_to_4h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.28.attention.bias.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.28.attention.dense.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.28.attention.query_key_value.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.28.mlp.dense_4h_to_h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.28.mlp.dense_h_to_4h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.29.attention.bias.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.29.attention.dense.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.29.attention.query_key_value.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.29.mlp.dense_4h_to_h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.29.mlp.dense_h_to_4h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.30.attention.bias.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.30.attention.dense.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.30.attention.query_key_value.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.30.mlp.dense_4h_to_h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.30.mlp.dense_h_to_4h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.31.attention.bias.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.31.attention.dense.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.31.attention.query_key_value.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.31.mlp.dense_4h_to_h.weight.dat filter=lfs diff=lfs merge=lfs -text
+offline_folder/gpt_neox.layers.31.mlp.dense_h_to_4h.weight.dat filter=lfs diff=lfs merge=lfs -text

.github/workflows/snyk-scan.yml ADDED Viewed

	@@ -0,0 +1,76 @@

+name: Snyk Security Vulnerability Scan
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    tags:
+      - 'v[0-9]+.[0-9]+.[0-9]+'
+    branches:
+      - main
+jobs:
+  snyk_scan_test:
+    if: ${{ github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@master
+      - uses: snyk/actions/setup@master
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Check changed Deps files
+        uses: tj-actions/changed-files@v35
+        id: changed-files
+        with:
+          files: | # This will match all the files with below patterns
+            requirements.txt
+      - name: Scan python dependencies
+        if: contains(steps.changed-files.outputs.all_changed_and_modified_files, 'requirements.txt')
+        env:
+          SNYK_TOKEN: '${{ secrets.SNYK_TOKEN }}'
+        run: |
+          head -n 41 requirements.txt > temp-requirements.txt #remove test deps
+          python3.10 -m pip install -r temp-requirements.txt
+          snyk test \
+            -d \
+            --file=temp-requirements.txt \
+            --package-manager=pip \
+            --command=python3.10 \
+            --skip-unresolved \
+            --severity-threshold=high
+  snyk_scan_monitor:
+    if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch'}}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@master
+      - uses: snyk/actions/setup@master
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Extract github branch/tag name
+        shell: bash
+        run: echo "ref=$(echo ${GITHUB_REF##*/})" >> $GITHUB_OUTPUT
+        id: extract_ref
+      - name: Monitor python dependencies
+        env:
+          SNYK_TOKEN: '${{ secrets.SNYK_TOKEN }}'
+        run: |
+          head -n 41 requirements.txt > temp-requirements.txt #remove test deps
+          python3.10 -m pip install -r temp-requirements.txt
+          snyk monitor \
+            -d \
+            --file=temp-requirements.txt \
+            --command=python3.10 \
+            --package-manager=pip \
+            --skip-unresolved \
+            --remote-repo-url=h2ogpt/${{ steps.extract_ref.outputs.ref }} \
+            --org=h2o-gpt \
+            --project-name=H2O-GPT/h2ogpt/${{ steps.extract_ref.outputs.ref }}/requirements.txt

.gitignore ADDED Viewed

	@@ -0,0 +1,37 @@

+out/
+7B/
+13B/
+__pycache__/
+checkpoint**
+minimal-llama**
+upload.py
+lora-**
+*ckpt
+wandb
+evaluate.py
+test_data.json
+todo.txt
+.neptune/
+*.bin
+db_dir_UserData
+temp_path_do_doc1
+offline_folder
+flagged_data_points
+.pytest_cache
+user_path
+user_path_test
+build
+h2ogpt.egg-info
+dist
+.idea
+.cache
+.local
+.bash_history
+.benchmarks
+Dockerfile-runner.dockerfile
+# IDEs
+.idea/
+# virtual envs
+venv

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# devel needed for bitsandbytes requirement of libcudart.so, otherwise runtime sufficient
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get install -y \
+    git \
+    curl \
+    wget \
+    software-properties-common \
+    pandoc \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt install -y python3.10 python3-dev libpython3.10-dev \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /workspace
+COPY requirements.txt requirements.txt
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+RUN python3.10 -m pip install -r requirements.txt
+COPY . .
+ENTRYPOINT [ "python3.10"]

Dockerfile-runner.in ADDED Viewed

	@@ -0,0 +1,15 @@

+FROM BASE_DOCKER_IMAGE_SUBST
+LABEL imagetype="runtime-h2ogpt"
+LABEL maintainer="H2O.ai <ops@h2o.ai>"
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TRANSFORMERS_CACHE=/h2ogpt_env/.cache
+ENV HF_MODEL=h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3
+COPY run-gpt.sh /run-gpt.sh
+EXPOSE 8888
+EXPOSE 7860
+ENTRYPOINT ["/run-gpt.sh"]

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Makefile ADDED Viewed

	@@ -0,0 +1,63 @@

+all: clean dist
+PACKAGE_VERSION       := `cat version.txt | tr -d '\n'`
+BUILD_TAG_FILES       := requirements.txt Dockerfile `ls reqs_optional/*.txt | sort`
+BUILD_TAG             := $(shell md5sum $(BUILD_TAG_FILES) 2> /dev/null | sort | md5sum | cut -d' ' -f1)
+DOCKER_TEST_IMAGE     := harbor.h2o.ai/h2ogpt/test-image:$(BUILD_TAG)
+DOCKER_RUN_IMAGE      := $(DOCKER_TEST_IMAGE)-runtime
+PYTHON_BINARY         ?= `which python`
+DEFAULT_MARKERS       ?= "not need_tokens and not need_gpu"
+.PHONY: reqs_optional/req_constraints.txt venv dist test publish docker_build
+reqs_optional/req_constraints.txt:
+	grep -v '#\|peft\|transformers\|accelerate' requirements.txt > $@
+clean:
+	rm -rf dist build h2ogpt.egg-info
+venv:
+	$(PYTHON_BINARY) -m virtualenv -p $(PYTHON_BINARY) venv
+install:
+	$(PYTHON_BINARY) -m pip install dist/h2ogpt-$(PACKAGE_VERSION)-py3-none-any.whl
+install-%:
+	$(PYTHON_BINARY) -m pip install dist/h2ogpt-$(PACKAGE_VERSION)-py3-none-any.whl[$*]
+dist:
+	$(PYTHON_BINARY) setup.py bdist_wheel
+test:
+	$(PYTHON_BINARY) -m pip install requirements-parser -c reqs_optional/req_constraints.txt
+	$(PYTHON_BINARY) -m pytest tests --disable-warnings --junit-xml=test_report.xml -m "$(DEFAULT_MARKERS)"
+test_imports:
+	$(PYTHON_BINARY) -m pytest tests/test_imports.py --disable-warnings --junit-xml=test_report.xml -m "$(DEFAULT_MARKERS)"
+publish:
+	echo "Publishing not implemented yet."
+docker_build:
+ifeq ($(shell curl --write-out %{http_code} -sS --output /dev/null -X GET http://harbor.h2o.ai/api/v2.0/projects/h2ogpt/repositories/test-image/artifacts/$(BUILD_TAG)/tags),200)
+	@echo "Image already pushed to Harbor: $(DOCKER_TEST_IMAGE)"
+else
+	DOCKER_BUILDKIT=1 docker build -t $(DOCKER_TEST_IMAGE) -f Dockerfile .
+	docker push $(DOCKER_TEST_IMAGE)
+endif
+.PHONY: Dockerfile-runner.dockerfile
+Dockerfile-runner.dockerfile: Dockerfile-runner.in
+	cat $< \
+	| sed 's|BASE_DOCKER_IMAGE_SUBST|$(DOCKER_TEST_IMAGE)|g' \
+	> $@
+docker_build_runner: docker_build Dockerfile-runner.dockerfile
+	docker pull $(DOCKER_TEST_IMAGE)
+	DOCKER_BUILDKIT=1 docker build -t $(DOCKER_RUN_IMAGE) -f Dockerfile-runner.dockerfile .
+	docker push $(DOCKER_RUN_IMAGE)
+	docker tag $(DOCKER_RUN_IMAGE) gcr.io/vorvan/h2oai/h2ogpt-runtime:$(BUILD_TAG)
+print-%:
+	@echo $($*)

README.md CHANGED Viewed

@@ -1,12 +1,214 @@
 ---
 title: H2OGPT
-emoji: 🏢
-colorFrom: pink
-colorTo: red
 sdk: gradio
-sdk_version: 3.36.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: H2OGPT
+app_file: gradio_runner.py
 sdk: gradio
+sdk_version: 3.35.2
 ---
+## h2oGPT
+[![img-small.png](docs/img-small.png) Live h2oGPT Demo](https://gpt.h2o.ai/)
+For questions, discussing, or just hanging out, come and join our <a href="https://discord.gg/WKhYMWcVbq"><b>Discord</b></a>!
+Technical Paper: [https://arxiv.org/pdf/2306.08161.pdf](https://arxiv.org/pdf/2306.08161.pdf)
+h2oGPT is a large language model (LLM) fine-tuning framework and chatbot UI with document(s) question-answer capabilities.  Documents help to **ground** LLMs against hallucinations by providing them context relevant to the instruction.  h2oGPT is a fully permissive Apache V2 open-source project for 100% private and secure use of LLMs and document embeddings for document question-answer.
+Welcome!  Join us and make an issue or a PR, and contribute to making the best fine-tuned LLMs, chatbot UI, and document question-answer framework!
+Turn ★ into ⭐ (top-right corner) if you like the project!
+<!--  cat README.md | ./gh-md-toc  -  But Help is heavily processed -->
+* [Supported OS and Hardware](#supported-os-and-hardware)
+* [Apache V2 ChatBot with LangChain Integration](#apache-v2-chatbot-with-langchain-integration)
+* [Apache V2 Data Preparation code, Training code, and Models](#apache-v2-data-preparation-code-training-code-and-models)
+* [Roadmap](#roadmap)
+* [Getting Started](#getting-started)
+   * [TLDR Install & Run](#tldr)
+   * [GPU (CUDA)](docs/README_GPU.md)
+   * [CPU](docs/README_CPU.md)
+   * [MACOS](docs/README_MACOS.md#macos)
+   * [Windows 10/11](docs/README_WINDOWS.md)
+   * [CLI chat](docs/README_CLI.md)
+   * [Gradio UI](docs/README_GRADIOUI.md)
+   * [Client API](docs/README_CLIENT.md)
+   * [Connect to Inference Servers](docs/README_InferenceServers.md)
+   * [Python Wheel](docs/README_WHEEL.md)
+* [Development](#development)
+* [Help](#help)
+   * [LangChain file types supported](docs/README_LangChain.md#supported-datatypes)
+   * [CLI Database control](docs/README_LangChain.md#database-creation)
+   * [Why h2oGPT for Doc Q&A](docs/README_LangChain.md#what-is-h2ogpts-langchain-integration-like)
+   * [FAQ](docs/FAQ.md)
+   * [Useful Links](docs/LINKS.md)
+   * [Fine-Tuning](docs/FINETUNE.md)
+   * [Docker](docs/INSTALL-DOCKER.md)
+   * [Triton](docs/TRITON.md)
+* [Acknowledgements](#acknowledgements)
+* [Why H2O.ai?](#why-h2oai)
+* [Disclaimer](#disclaimer)
+### Supported OS and Hardware
+[![GitHub license](https://img.shields.io/github/license/NVIDIA/nvidia-docker?style=flat-square)](https://raw.githubusercontent.com/h2oai/h2ogpt/main/LICENSE)
+![Linux](https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black)
+![macOS](https://img.shields.io/badge/mac%20os-000000?style=for-the-badge&logo=macos&logoColor=F0F0F0)
+![Windows](https://img.shields.io/badge/Windows-0078D6?style=for-the-badge&logo=windows&logoColor=white)
+![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?style=for-the-badge&logo=docker&logoColor=white)
+**GPU** mode requires CUDA support via torch and transformers.  A 6.9B (or 12B) model in 8-bit uses 8GB (or 13GB) of GPU memory. 8-bit or 4-bit precision can further reduce memory requirements down to no more than about 6.5GB when asking a question about your documents (see [low-memory mode](docs/FAQ.md#low-memory-mode)).
+**CPU** mode uses GPT4ALL and LLaMa.cpp, e.g. gpt4all-j, requiring about 14GB of system RAM in typical use.
+GPU and CPU modes were tested on a variety of NVIDIA GPUs in Ubuntu 18-22, but any modern Linux variant should work.  macOS support was tested on a MacBook Pro running Monterey v12.3.1 using CPU mode.
+### Apache V2 ChatBot with LangChain Integration
+- [**LangChain**](docs/README_LangChain.md) equipped Chatbot integration and streaming responses
+- **Persistent** database using Chroma or in-memory with FAISS
+- **Original** content URL links and scores to rank content against query
+- **Private** offline database of any documents ([PDFs, Images, and many more](docs/README_LangChain.md#supported-datatypes))
+- **Upload** documents via chatbot into shared space or only allow scratch space
+- **Control** data sources and the context provided to LLM
+- **Efficient** use of context using instruct-tuned LLMs (no need for many examples)
+- **API** for client-server control
+- **CPU and GPU** support from a variety of HF models, and CPU support using GPT4ALL and LLaMa cpp
+- **Linux, MAC, and Windows** support
+Light mode with soft colors talking to cat image:
+![Talk to Cat](docs/ui_talk_to_images.png)
+Dark mode with H2O.ai colors:
+<img src="docs/langchain.png" alt="VectorDB" title="VectorDB via LangChain">
+### Apache V2 Data Preparation code, Training code, and Models
+- **Variety** of models (h2oGPT, WizardLM, Vicuna, OpenAssistant, etc.) supported
+- **Fully Commercially** usable Apache V2 code, data and models
+- **High-Quality** data cleaning of large open-source instruction datasets
+- **LoRA** and **QLoRA** (low-rank approximation) efficient 4-bit, 8-bit and 16-bit fine-tuning and generation
+- **Large** (up to 65B parameters) models built on commodity or enterprise GPUs (single or multi node)
+- **Evaluate** performance using RLHF-based reward models
+https://user-images.githubusercontent.com/6147661/232924684-6c0e2dfb-2f24-4098-848a-c3e4396f29f6.mov
+All open-source datasets and models are posted on [🤗 H2O.ai's Hugging Face page](https://huggingface.co/h2oai/).
+Also check out [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio) for our no-code LLM fine-tuning framework!
+### Roadmap
+- Integration of code and resulting LLMs with downstream applications and low/no-code platforms
+- Complement h2oGPT chatbot with search and other APIs
+- High-performance distributed training of larger models on trillion tokens
+- Enhance the model's code completion, reasoning, and mathematical capabilities, ensure factual correctness, minimize hallucinations, and avoid repetitive output
+- Add other tools like search
+- Add agents for SQL and CSV question/answer
+### Getting Started
+First one needs a Python 3.10 environment.  For help installing a Python 3.10 environment, see [Install Python 3.10 Environment](docs/INSTALL.md#install-python-environment).  On newer Ubuntu systems an environment may be installed by just doing:
+```bash
+sudo apt-get install -y build-essential gcc python3.10-dev
+virtualenv -p python3 h2ogpt
+source h2ogpt/bin/activate
+```
+Check your installation by doing:
+```bash
+python --version # should say 3.10.xx
+pip --version  # should say pip 23.x.y ... (python 3.10)
+```
+On some systems, `pip` still refers back to the system one; in that case, use `python -m pip` or `pip3` instead of `pip`, or try `python3` instead of `python`.
+#### TLDR
+After the Python 3.10 environment is installed:
+```bash
+git clone https://github.com/h2oai/h2ogpt.git
+cd h2ogpt
+# fix any bad env
+pip uninstall -y pandoc pypandoc pypandoc-binary
+# broad support, but no training-time or data creation dependencies
+for fil in requirements.txt reqs_optional/requirements_optional_langchain.txt reqs_optional/requirements_optional_gpt4all.txt reqs_optional/requirements_optional_langchain.gpllike.txt reqs_optional/requirements_optional_langchain.urls.txt ; do pip install -r $fil ; done
+# Optional: support docx, pptx, ArXiv, etc.
+sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
+# Optional: for supporting unstructured package
+python -m nltk.downloader all
+```
+Place all documents in `user_path` or upload in UI.
+UI using GPU with at least 24GB with streaming:
+```bash
+python generate.py --base_model=h2oai/h2ogpt-oasst1-512-12b --load_8bit=True  --score_model=None --langchain_mode='UserData' --user_path=user_path
+```
+UI using CPU:
+```bash
+python generate.py --base_model='llama' --prompt_type=wizard2 --score_model=None --langchain_mode='UserData' --user_path=user_path
+```
+### Development
+- To create a development environment for training and generation, follow the [installation instructions](docs/INSTALL.md).
+- To fine-tune any LLM models on your data, follow the [fine-tuning instructions](docs/FINETUNE.md).
+- To create a container for deployment, follow the [Docker instructions](docs/INSTALL-DOCKER.md).
+### Help
+- Flash attention support, see [Flash Attention](docs/INSTALL.md#flash-attention)
+- [Docker](docs/INSTALL-DOCKER.md#containerized-installation-for-inference-on-linux-gpu-servers) for inference.
+- [FAQs](docs/FAQ.md)
+- [README for LangChain](docs/README_LangChain.md)
+- More [Links](docs/LINKS.md), context, competitors, models, datasets
+### Acknowledgements
+* Some training code was based upon March 24 version of [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/).
+* Used high-quality data created by [OpenAssistant](https://open-assistant.io/).
+* Used base models by [EleutherAI](https://www.eleuther.ai/).
+* Used OIG data created by [LAION](https://laion.ai/blog/oig-dataset/).
+### Why H2O.ai?
+Our [Makers](https://h2o.ai/company/team/) at [H2O.ai](https://h2o.ai) have built several world-class Machine Learning, Deep Learning and AI platforms:
+- #1 open-source machine learning platform for the enterprise [H2O-3](https://github.com/h2oai/h2o-3)
+- The world's best AutoML (Automatic Machine Learning) with [H2O Driverless AI](https://h2o.ai/platform/ai-cloud/make/h2o-driverless-ai/)
+- No-Code Deep Learning with [H2O Hydrogen Torch](https://h2o.ai/platform/ai-cloud/make/hydrogen-torch/)
+- Document Processing with Deep Learning in [Document AI](https://h2o.ai/platform/ai-cloud/make/document-ai/)
+We also built platforms for deployment and monitoring, and for data wrangling and governance:
+- [H2O MLOps](https://h2o.ai/platform/ai-cloud/operate/h2o-mlops/) to deploy and monitor models at scale
+- [H2O Feature Store](https://h2o.ai/platform/ai-cloud/make/feature-store/) in collaboration with AT&T
+- Open-source Low-Code AI App Development Frameworks [Wave](https://wave.h2o.ai/) and [Nitro](https://nitro.h2o.ai/)
+- Open-source Python [datatable](https://github.com/h2oai/datatable/) (the engine for H2O Driverless AI feature engineering)
+Many of our customers are creating models and deploying them enterprise-wide and at scale in the [H2O AI Cloud](https://h2o.ai/platform/ai-cloud/):
+- Multi-Cloud or on Premises
+- [Managed Cloud (SaaS)](https://h2o.ai/platform/ai-cloud/managed)
+- [Hybrid Cloud](https://h2o.ai/platform/ai-cloud/hybrid)
+- [AI Appstore](https://docs.h2o.ai/h2o-ai-cloud/)
+We are proud to have over 25 (of the world's 280) [Kaggle Grandmasters](https://h2o.ai/company/team/kaggle-grandmasters/) call H2O home, including three Kaggle Grandmasters who have made it to world #1.
+### Disclaimer
+Please read this disclaimer carefully before using the large language model provided in this repository. Your use of the model signifies your agreement to the following terms and conditions.
+- Biases and Offensiveness: The large language model is trained on a diverse range of internet text data, which may contain biased, racist, offensive, or otherwise inappropriate content. By using this model, you acknowledge and accept that the generated content may sometimes exhibit biases or produce content that is offensive or inappropriate. The developers of this repository do not endorse, support, or promote any such content or viewpoints.
+- Limitations: The large language model is an AI-based tool and not a human. It may produce incorrect, nonsensical, or irrelevant responses. It is the user's responsibility to critically evaluate the generated content and use it at their discretion.
+- Use at Your Own Risk: Users of this large language model must assume full responsibility for any consequences that may arise from their use of the tool. The developers and contributors of this repository shall not be held liable for any damages, losses, or harm resulting from the use or misuse of the provided model.
+- Ethical Considerations: Users are encouraged to use the large language model responsibly and ethically. By using this model, you agree not to use it for purposes that promote hate speech, discrimination, harassment, or any form of illegal or harmful activities.
+- Reporting Issues: If you encounter any biased, offensive, or otherwise inappropriate content generated by the large language model, please report it to the repository maintainers through the provided channels. Your feedback will help improve the model and mitigate potential issues.
+- Changes to this Disclaimer: The developers of this repository reserve the right to modify or update this disclaimer at any time without prior notice. It is the user's responsibility to periodically review the disclaimer to stay informed about any changes.
+By using the large language model provided in this repository, you agree to accept and comply with the terms and conditions outlined in this disclaimer. If you do not agree with any part of this disclaimer, you should refrain from using the model and any content generated by it.
+## Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=h2oai/h2ogpt&type=Timeline)](https://star-history.com/#h2oai/h2ogpt&Timeline)

__pycache__/enums.cpython-310.pyc ADDED Viewed

Binary file (2.34 kB). View file

__pycache__/generate.cpython-310.pyc ADDED Viewed

Binary file (63.9 kB). View file

__pycache__/gpt4all_llm.cpython-310.pyc ADDED Viewed

Binary file (8.27 kB). View file

__pycache__/gpt_langchain.cpython-310.pyc ADDED Viewed

Binary file (60.1 kB). View file

__pycache__/gradio_runner.cpython-310.pyc ADDED Viewed

Binary file (67.4 kB). View file

__pycache__/gradio_themes.cpython-310.pyc ADDED Viewed

Binary file (6.64 kB). View file

__pycache__/h2oai_pipeline.cpython-310.pyc ADDED Viewed

Binary file (5.3 kB). View file

__pycache__/loaders.cpython-310.pyc ADDED Viewed

Binary file (1.66 kB). View file

__pycache__/prompter.cpython-310.pyc ADDED Viewed

Binary file (15.3 kB). View file

__pycache__/stopping.cpython-310.pyc ADDED Viewed

Binary file (2.98 kB). View file

__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (28.5 kB). View file

__pycache__/utils_langchain.cpython-310.pyc ADDED Viewed

Binary file (2.76 kB). View file

blog/README.md ADDED Viewed

	@@ -0,0 +1,81 @@

+# Building the World's Best Open-Source Large Language Model: H2O.ai's Journey
+by Arno Candel, PhD, CTO H2O.ai, April 19 2023
+At H2O.ai, we pride ourselves on developing world-class Machine Learning, Deep Learning, and AI platforms. We released H2O, the most widely used open-source distributed and scalable machine learning platform, before XGBoost, TensorFlow and PyTorch existed. H2O.ai is home to over 25 Kaggle grandmasters, including the current #1. In 2017, we used GPUs to create the world's best AutoML in H2O Driverless AI. We have witnessed first-hand how Large Language Models (LLMs) have taken the world by storm.
+We are proud to announce that we are building h2oGPT, an LLM that not only excels in performance but is also fully open-source and commercially usable, providing a valuable resource for developers, researchers, and organizations worldwide.
+In this blog, we'll explore our journey in building h2oGPT in our effort to further democratize AI.
+## Why Open-Source LLMs?
+While LLMs like OpenAI's ChatGPT/GPT-4, Anthropic's Claude, Microsoft's Bing AI Chat, Google's Bard, and Cohere are powerful and effective, they have certain limitations compared to open-source LLMs:
+1. **Data Privacy and Security**: Using hosted LLMs requires sending data to external servers. This can raise concerns about data privacy, security, and compliance, especially for sensitive information or industries with strict regulations.
+2. **Dependency and Customization**: Hosted LLMs often limit the extent of customization and control, as users rely on the service provider's infrastructure and predefined models. Open-source LLMs allow users to tailor the models to their specific needs, deploy on their own infrastructure, and even modify the underlying code.
+3. **Cost and Scalability**: Hosted LLMs usually come with usage fees, which can increase significantly with large-scale applications. Open-source LLMs can be more cost-effective, as users can scale the models on their own infrastructure without incurring additional costs from the service provider.
+4. **Access and Availability**: Hosted LLMs may be subject to downtime or limited availability, affecting users' access to the models. Open-source LLMs can be deployed on-premises or on private clouds, ensuring uninterrupted access and reducing reliance on external providers.
+Overall, open-source LLMs offer greater flexibility, control, and cost-effectiveness, while addressing data privacy and security concerns. They foster a competitive landscape in the AI industry and empower users to innovate and customize models to suit their specific needs.
+## The H2O.ai LLM Ecosystem
+Our open-source LLM ecosystem currently includes the following components:
+1. **Code, data, and models**: Fully permissive, commercially usable [code](https://github.com/h2oai/h2ogpt), curated fine-tuning [data](https://huggingface.co/h2oai), and fine-tuned [models](https://huggingface.co/h2oai) ranging from 7 to 20 billion parameters.
+2. **State-of-the-art fine-tuning**: We provide code for highly efficient fine-tuning, including targeted data preparation, prompt engineering, and computational optimizations to fine-tune LLMs with up to 20 billion parameters (even larger models expected soon) in hours on commodity hardware or enterprise servers. Techniques like low-rank approximations (LoRA) and data compression allow computational savings of several orders of magnitude.
+3. **Chatbot**: We provide code to run a multi-tenant chatbot on GPU servers, with an easily shareable end-point and a Python client API, allowing you to evaluate and compare the performance of fine-tuned LLMs.
+4. **H2O LLM Studio**: Our no-code LLM fine-tuning framework created by the world's top Kaggle grandmasters makes it even easier to fine-tune and evaluate LLMs.
+Everything we release is based on fully permissive data and models, with all code open-sourced, enabling broader access for businesses and commercial products without legal concerns, thus expanding access to cutting-edge AI while adhering to licensing requirements.
+## Roadmap and Future Plans
+We have an ambitious roadmap for our LLM ecosystem, including:
+1. Integration with downstream applications and low/no-code platforms (H2O Document AI, H2O LLM Studio, etc.)
+2. Improved validation and benchmarking frameworks of LLMs
+3. Complementing our chatbot with search and other APIs (LangChain, etc.)
+4. Contribute to large-scale data cleaning efforts (Open Assistant, Stability AI, RedPajama, etc.)
+5. High-performance distributed training of larger models on trillion tokens
+6. High-performance scalable on-premises hosting for high-throughput endpoints
+7. Improvements in code completion, reasoning, mathematics, factual correctness, hallucinations, and reducing repetitions
+## Getting Started with H2O.ai's LLMs
+You can [Chat with h2oGPT](https://gpt.h2o.ai/) right now!
+https://user-images.githubusercontent.com/6147661/232924684-6c0e2dfb-2f24-4098-848a-c3e4396f29f6.mov
+![](https://user-images.githubusercontent.com/6147661/233239878-de3b0fce-5425-4189-8095-5313c7817d58.png)
+![](https://user-images.githubusercontent.com/6147661/233239861-e99f238c-dd5d-4dd7-ac17-6367f91f86ac.png)
+To start using our LLM as a developer, follow the steps below:
+1. Clone the repository: `git clone https://github.com/h2oai/h2ogpt.git`
+2. Change to the repository directory: `cd h2ogpt`
+3. Install the requirements: `pip install -r requirements.txt`
+4. Run the chatbot: `python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6_9b`
+5. Open your browser at `http://0.0.0.0:7860` or the public live URL printed by the server.
+For more information, visit [h2oGPT GitHub page](https://github.com/h2oai/h2ogpt), [H2O.ai's Hugging Face page](https://huggingface.co/h2oai) and [H2O LLM Studio GitHub page](https://github.com/h2oai/h2o-llmstudio).
+Join us on this exciting journey as we continue to improve and expand the capabilities of our open-source LLM ecosystem!
+## Acknowledgements
+We appreciate the work by many open-source contributors, especially:
+* [H2O.ai makers](https://h2o.ai/company/team/)
+* [Alpaca-LoRA](https://github.com/tloen/alpaca-lora/)
+* [LoRA](https://github.com/microsoft/LoRA/)
+* [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca/)
+* [Hugging Face](https://huggingface.co/)
+* [OpenAssistant](https://open-assistant.io/)
+* [EleutherAI](https://www.eleuther.ai/)
+* [LAION](https://laion.ai/blog/oig-dataset/)
+* [BigScience](https://github.com/bigscience-workshop/bigscience/)
+* [LLaMa](https://github.com/facebookresearch/llama/)
+* [StableLM](https://github.com/Stability-AI/StableLM/)
+* [Vicuna](https://github.com/lm-sys/FastChat/)

ci/jenkinsfile ADDED Viewed

	@@ -0,0 +1,158 @@

+#!/usr/bin/groovy
+@Library('test-shared-library@dai_pipeline') _
+import ai.h2o.ci.buildsummary.StagesSummary
+import groovy.json.JsonOutput
+buildSummary('https://github.com/h2oai/h2ogpt', true)
+buildSummary.get().addStagesSummary(this, new StagesSummary())
+// Test matrix: maps a test-target name to the settings the 'Tests' stage reads for it.
+//   install_deps - suffix appended to "make install-" after the base "make install"
+//   test_target  - make target that runs the tests
+//   node         - Jenkins node label expression the target is scheduled on
+//   test_markers - exported as DEFAULT_MARKERS when invoking the test target
+//   timeout      - per-target timeout, in minutes
+//   use_docker   - true: run inside the docker test image; false: run in a venv created by "make venv"
+//   env          - extra environment entries prepended to the withEnv list
+def ALL_TESTS = [
+        "test_osx": [
+            install_deps: "TRAINING",
+            test_target: "test_imports",
+            node: "osx",
+            test_markers: "not need_tokens and not need_gpu",
+            timeout: 90,
+            use_docker: false,
+            env: ['PYTHON_BINARY=/Users/jenkins/anaconda/envs/h2ogpt-py3.10/bin/python']
+        ],
+        "test_all": [
+            install_deps: "TRAINING,WIKI_EXTRA",
+            test_target: "test",
+            test_markers: "not need_tokens and not need_gpu",
+            node: "DAIDEV-GPU || DAIDEV-2GPU",
+            timeout: 90,
+            use_docker: true,
+            env: []
+        ],
+]
+// Declarative pipeline with three stages:
+//   Build   - builds the docker image and the wheel, then archives and stashes the wheel
+//   Tests   - runs every selected entry of ALL_TESTS in parallel (skipped when skipTesting is set)
+//   Publish - runs "make publish" (only when the publish parameter is set)
+pipeline {
+    agent none
+    parameters {
+        booleanParam(name: 'skipTesting', defaultValue: false, description: 'Skip testing')
+        // one test-target name per line; defaults to every key of ALL_TESTS
+        text(name: "testTargets", defaultValue: "${ALL_TESTS.keySet().join('\n')}", description: "A select set of tests to run")
+        booleanParam(name: 'publish', defaultValue: false, description: 'Upload to HF')
+    }
+    options {
+        ansiColor('xterm')
+        timestamps()
+    }
+    stages {
+        stage('Build') {
+            agent {
+                label "linux && docker"
+            }
+            steps {
+                script {
+                    // label the build with the short commit hash and the last commit's author/subject
+                    def shortHash = sh(returnStdout: true, script: 'git rev-parse --short HEAD').trim()
+                    def commitMsg = sh(returnStdout: true, script: 'git log -1 --pretty=format:"[%an] %s"').trim()
+                    currentBuild.displayName = "${env.BUILD_ID} - [${shortHash}]"
+                    currentBuild.description = "${commitMsg}"
+                    sh "make docker_build"
+                    docker.image("harbor.h2o.ai/library/python:3.10").inside("--entrypoint='' --security-opt seccomp=unconfined -e USE_WHEEL=1 -e HOME=${WORKSPACE}") {
+                        sh "make clean dist"
+                    }
+                    archiveArtifacts allowEmptyArchive: true, artifacts: "dist/h2ogpt-*.whl"
+                    // stashed so that the test nodes can unstash the wheel built here
+                    stash includes: "dist/h2ogpt-*.whl", name: "wheel_file"
+                }
+            }
+        }
+        stage('Tests') {
+            when {
+                anyOf {
+                    expression { return !params.skipTesting }
+                }
+                beforeAgent true
+            }
+            agent {
+                label "linux && docker"
+            }
+            steps {
+                script {
+                    // build a map of test name -> closure, then run all closures in parallel
+                    def testTargets = [:]
+                    // only lines containing "test_" are treated as test-target names
+                    params.testTargets.split('\n').findAll{ it.contains("test_") }.each { testName ->
+                        testTargets[testName] = {
+                            node("${ALL_TESTS[testName].node}") {
+                                buildSummary.stageWithSummary("${testName}", "${testName}") {
+                                    buildSummary.setStageUrl("${testName}")
+                                    timeout(time: ALL_TESTS[testName].timeout, unit: 'MINUTES') {
+                                        script {
+                                            try {
+                                                // each target works in its own sub-directory named after the target
+                                                dir("${testName}") {
+                                                    withEnv(ALL_TESTS[testName].env + ["PYTEST_TEST_NAME=_${testName}", "IS_PR_BUILD=${isPrBranch()}", "USE_WHEEL=1"]) {
+                                                        // cleanup and force the use of the installed wheel
+                                                        deleteDir()
+                                                        checkout scm
+                                                        unstash "wheel_file"
+                                                        sh "rm -rf *.py spaces models"
+                                                        // pull runtime details
+                                                        def dockerImage = sh(returnStdout: true, script: "make print-DOCKER_TEST_IMAGE").trim()
+                                                        def nvidiaSmiExitCode = sh(returnStdout: false, returnStatus: true, script: "nvidia-smi")
+                                                        // def dockerRuntime = "${nvidiaSmiExitCode}" == "0" ? "--runtime nvidia" : ""
+                                                        def dockerRuntime = ""  // TODO: keep until lab machines are upgraded
+                                                        if (ALL_TESTS[testName].use_docker) {
+                                                            docker.image("${dockerImage}").inside("--entrypoint='' --security-opt seccomp=unconfined --ulimit core=-1 --init --pid=host -e USE_WHEEL=1 -e HOME=${WORKSPACE}/${testName} ${dockerRuntime}") {
+                                                                sh "nvidia-smi || true"
+                                                                sh "SKIP_MANUAL_TESTS=1 PYTHON_BINARY=/usr/bin/python3.10 make install"
+                                                                sh "SKIP_MANUAL_TESTS=1 PYTHON_BINARY=/usr/bin/python3.10 make install-${ALL_TESTS[testName].install_deps}"
+                                                                sh """DEFAULT_MARKERS="${ALL_TESTS[testName].test_markers}" SKIP_MANUAL_TESTS=1 PYTHON_BINARY=/usr/bin/python3.10 make ${ALL_TESTS[testName].test_target}"""
+                                                            }
+                                                        } else {
+                                                            // non-docker path: same install/test sequence, using the python of a fresh venv
+                                                            sh "make venv"
+                                                            sh "SKIP_MANUAL_TESTS=1 PYTHON_BINARY=${WORKSPACE}/${testName}/venv/bin/python make install"
+                                                            sh "SKIP_MANUAL_TESTS=1 PYTHON_BINARY=${WORKSPACE}/${testName}/venv/bin/python make install-${ALL_TESTS[testName].install_deps}"
+                                                            sh """DEFAULT_MARKERS="${ALL_TESTS[testName].test_markers}" SKIP_MANUAL_TESTS=1 PYTHON_BINARY=${WORKSPACE}/${testName}/venv/bin/python make ${ALL_TESTS[testName].test_target}"""
+                                                        }
+                                                    }
+                                                }
+                                            } catch (e) {
+                                                throw e
+                                            } finally {
+                                                // rename the report so that reports of different targets do not collide
+                                                // NOTE(review): if no test_report.xml was produced, this mv fails inside finally
+                                                // and may mask the original failure - confirm whether that is intended, given
+                                                // that the two steps below explicitly tolerate a missing report.
+                                                sh "mv ${testName}/test_report.xml ${testName}/${testName}_report.xml"
+                                                archiveArtifacts allowEmptyArchive: true, artifacts: "${testName}/${testName}_report.xml"
+                                                junit testResults: "${testName}/${testName}_report.xml", keepLongStdio: true, allowEmptyResults: true
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    parallel(testTargets)
+                }
+            }
+        }
+        stage('Publish') {
+            when {
+                anyOf {
+                    expression { return params.publish }
+                }
+                beforeAgent true
+            }
+            agent {
+                label "linux && docker"
+            }
+            steps {
+                script {
+                    sh "make IS_PR_BUILD=${isPrBranch()} BUILD_NUMBER=${env.BUILD_ID} BUILD_BASE_NAME=${env.JOB_BASE_NAME} publish"
+                }
+            }
+        }
+    }
+}
+// Returns true when the build belongs to a pull request: either CHANGE_BRANCH is
+// set to a non-empty value, or BRANCH_NAME starts with "PR-".
+def isPrBranch() {
+    def changeBranch = env.CHANGE_BRANCH
+    if (changeBranch != null && changeBranch != '') {
+        return true
+    }
+    def branchName = env.BRANCH_NAME
+    if (branchName == null) {
+        return false
+    }
+    return branchName.startsWith("PR-")
+}

cli.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import copy
+import torch
+from generate import eval_func_param_names, get_score_model, get_model, evaluate, check_locals
+from prompter import non_hf_types
+from utils import clear_torch_cache, NullContext, get_kwargs
+def run_cli(  # for local function:
+        base_model=None, lora_weights=None, inference_server=None,
+        debug=None, chat_context=None,
+        examples=None, memory_restriction_level=None,
+        # for get_model:
+        score_model=None, load_8bit=None, load_4bit=None, load_half=None, infer_devices=None, tokenizer_base_model=None,
+        gpu_id=None, local_files_only=None, resume_download=None, use_auth_token=None,
+        trust_remote_code=None, offload_folder=None, compile_model=None,
+        # for some evaluate args
+        stream_output=None, prompt_type=None, prompt_dict=None,
+        temperature=None, top_p=None, top_k=None, num_beams=None,
+        max_new_tokens=None, min_new_tokens=None, early_stopping=None, max_time=None, repetition_penalty=None,
+        num_return_sequences=None, do_sample=None, chat=None,
+        langchain_mode=None, document_choice=None, top_k_docs=None, chunk=None, chunk_size=None,
+        # for evaluate kwargs
+        src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None,
+        model_state0=None,
+        max_max_new_tokens=None,
+        is_public=None,
+        max_max_time=None,
+        raise_generate_gpu_exceptions=None, load_db_if_exists=None, dbs=None, user_path=None,
+        detect_user_path_changes_every_query=None,
+        use_openai_embedding=None, use_openai_model=None, hf_embedding_model=None,
+        db_type=None, n_jobs=None, first_para=None, text_limit=None, verbose=None, cli=None, reverse_docs=None,
+        use_cache=None,
+        auto_reduce_chunks=None, max_chunks=None, model_lock=None, force_langchain_evaluate=None,
+        model_state_none=None,
+        # unique to this function:
+        cli_loop=None,
+):
+    check_locals(**locals())
+    score_model = ""  # FIXME: For now, so user doesn't have to pass
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
+    device = 'cpu' if n_gpus == 0 else 'cuda'
+    context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
+    with context_class(device):
+        from functools import partial
+        # get score model
+        smodel, stokenizer, sdevice = get_score_model(reward_type=True,
+                                                      **get_kwargs(get_score_model, exclude_names=['reward_type'],
+                                                                   **locals()))
+        model, tokenizer, device = get_model(reward_type=False,
+                                             **get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
+        model_dict = dict(base_model=base_model, tokenizer_base_model=tokenizer_base_model, lora_weights=lora_weights,
+                          inference_server=inference_server, prompt_type=prompt_type, prompt_dict=prompt_dict)
+        model_state = dict(model=model, tokenizer=tokenizer, device=device)
+        model_state.update(model_dict)
+        my_db_state = [None]
+        fun = partial(evaluate, model_state, my_db_state,
+                      **get_kwargs(evaluate, exclude_names=['model_state', 'my_db_state'] + eval_func_param_names,
+                                   **locals()))
+        example1 = examples[-1]  # pick reference example
+        all_generations = []
+        while True:
+            clear_torch_cache()
+            instruction = input("\nEnter an instruction: ")
+            if instruction == "exit":
+                break
+            eval_vars = copy.deepcopy(example1)
+            eval_vars[eval_func_param_names.index('instruction')] = \
+                eval_vars[eval_func_param_names.index('instruction_nochat')] = instruction
+            eval_vars[eval_func_param_names.index('iinput')] = \
+                eval_vars[eval_func_param_names.index('iinput_nochat')] = ''  # no input yet
+            eval_vars[eval_func_param_names.index('context')] = ''  # no context yet
+            # grab other parameters, like langchain_mode
+            for k in eval_func_param_names:
+                if k in locals():
+                    eval_vars[eval_func_param_names.index(k)] = locals()[k]
+            gener = fun(*tuple(eval_vars))
+            outr = ''
+            res_old = ''
+            for gen_output in gener:
+                res = gen_output['response']
+                extra = gen_output['sources']
+                if base_model not in non_hf_types or base_model in ['llama']:
+                    if not stream_output:
+                        print(res)
+                    else:
+                        # then stream output for gradio that has full output each generation, so need here to show only new chars
+                        diff = res[len(res_old):]
+                        print(diff, end='', flush=True)
+                        res_old = res
+                    outr = res  # don't accumulate
+                else:
+                    outr += res  # just is one thing
+                    if extra:
+                        # show sources at end after model itself had streamed to std rest of response
+                        print(extra, flush=True)
+            all_generations.append(outr + '\n')
+            if not cli_loop:
+                break
+    return all_generations

client/.gitignore ADDED Viewed

	@@ -0,0 +1,164 @@

+### Generated files ###
+enums.py
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/

client/Makefile ADDED Viewed

	@@ -0,0 +1,25 @@

+# Package name and version as reported by `poetry version`; the package directory
+# is the package name with dashes replaced by underscores.
+PACKAGE_NAME    := $(firstword $(shell poetry version))
+PACKAGE_DIR     := $(subst -,_,$(PACKAGE_NAME))
+PACKAGE_VERSION := $(shell poetry version --short)
+# Copy enums.py from the parent directory into the client package.
+# Declared phony so that a stray file named "generate_sources" cannot skip the copy.
+.PHONY: generate_sources
+generate_sources:
+	cp -f ./../enums.py "$(PACKAGE_DIR)/enums.py"
+.PHONY: setup
+setup:
+	poetry install
+# flake8 findings are reported but do not fail the target (note the "|| true").
+.PHONY: lint
+lint: generate_sources
+	poetry run black .
+	poetry run isort .
+	poetry run flake8 "$(PACKAGE_DIR)" "tests" || true
+	poetry run mypy --show-error-codes --pretty .
+.PHONY: test
+test: generate_sources
+	poetry run pytest
+.PHONY: build
+build: generate_sources
+	poetry build

client/README.md ADDED Viewed

	@@ -0,0 +1,41 @@

+# h2oGPT Client
+A Python thin-client for h2oGPT.
+## Installation
+### Prerequisites
+- Python 3.8+
+- [Poetry](https://python-poetry.org/docs/#installation) - A dependency management and packaging tool for Python
+### Setup environment
+```shell
+cd client
+pip install poetry==1.5.1
+make setup
+make lint
+make build
+# install (choose version if multiple builds in dist directory)
+pip install dist/h2ogpt_client-*-py3-none-any.whl
+# test
+cd ..
+pytest -s -v --forked client
+```
+## Usage
+```python
+from h2ogpt_client import Client
+client = Client("http://0.0.0.0:7860")
+# text completion
+response = client.text_completion.create("Hello world")
+response = await client.text_completion.create_async("Hello world")
+# chat completion
+chat_context = client.chat_completion.create()
+chat = chat_context.chat("Hey!")
+print(chat["user"])  # prints user prompt, i.e. "Hey!"
+print(chat["gpt"])   # prints reply of the h2oGPT
+chat_history = chat_context.chat_history()
+```
+:warning: **Note**: Client APIs are still evolving and may change without prior warning.

client/h2ogpt_client/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from h2ogpt_client.core import Client
+from h2ogpt_client.enums import LangChainMode, PromptType
+__all__ = ["Client", "PromptType", "LangChainMode"]

client/h2ogpt_client/core.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import asyncio
+import collections
+from typing import Any, Dict, List, Optional, OrderedDict, Tuple
+import gradio_client  # type: ignore
+from h2ogpt_client import enums
+class Client:
+    def __init__(self, server_url: str, huggingface_token: Optional[str] = None):
+        self._client = gradio_client.Client(
+            src=server_url, hf_token=huggingface_token, serialize=False, verbose=False
+        )
+        self._text_completion = TextCompletion(self)
+        self._chat_completion = ChatCompletion(self)
+    @property
+    def text_completion(self) -> "TextCompletion":
+        return self._text_completion
+    @property
+    def chat_completion(self) -> "ChatCompletion":
+        return self._chat_completion
+    def _predict(self, *args, api_name: str) -> Any:
+        return self._client.submit(*args, api_name=api_name).result()
+    async def _predict_async(self, *args, api_name: str) -> str:
+        return await asyncio.wrap_future(self._client.submit(*args, api_name=api_name))
+class TextCompletion:
+    """Text completion"""
+    def __init__(self, client: Client):
+        self._client = client
+    def create(
+        self,
+        prompt: str,
+        prompt_type: enums.PromptType = enums.PromptType.plain,
+        input_context_for_instruction: str = "",
+        enable_sampler=False,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        beams: float = 1.0,
+        early_stopping: bool = False,
+        min_output_length: int = 0,
+        max_output_length: int = 128,
+        max_time: int = 180,
+        repetition_penalty: float = 1.07,
+        number_returns: int = 1,
+        system_pre_context: str = "",
+        langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
+    ) -> str:
+        """
+        Creates a new text completion.
+        :param prompt: text prompt to generate completions for
+        :param prompt_type: type of the prompt
+        :param input_context_for_instruction: input context for instruction
+        :param enable_sampler: enable or disable the sampler, required for use of
+                temperature, top_p, top_k
+        :param temperature: What sampling temperature to use, between 0 and 3.
+                Lower values will make it more focused and deterministic, but may lead
+                to repeat. Higher values will make the output more creative, but may
+                lead to hallucinations.
+        :param top_p: cumulative probability of tokens to sample from
+        :param top_k: number of tokens to sample from
+        :param beams: Number of searches for optimal overall probability.
+                Higher values uses more GPU memory and compute.
+        :param early_stopping: whether to stop early or not in beam search
+        :param min_output_length: minimum output length
+        :param max_output_length: maximum output length
+        :param max_time: maximum time to search optimal output
+        :param repetition_penalty: penalty for repetition
+        :param number_returns:
+        :param system_pre_context: directly pre-appended without prompt processing
+        :param langchain_mode: LangChain mode
+        :return: response from the model
+        """
+        # Not exposed parameters.
+        instruction = ""  # empty when chat_mode is False
+        input = ""  # only chat_mode is True
+        stream_output = False
+        prompt_dict = ""  # empty as prompt_type cannot be 'custom'
+        chat_mode = False
+        langchain_top_k_docs = 4  # number of document chunks; not public
+        langchain_enable_chunk = True  # whether to chunk documents; not public
+        langchain_chunk_size = 512  # chunk size for document chunking; not public
+        langchain_document_choice = ["All"]
+        return self._client._predict(
+            instruction,
+            input,
+            system_pre_context,
+            stream_output,
+            prompt_type.value,
+            prompt_dict,
+            temperature,
+            top_p,
+            top_k,
+            beams,
+            max_output_length,
+            min_output_length,
+            early_stopping,
+            max_time,
+            repetition_penalty,
+            number_returns,
+            enable_sampler,
+            chat_mode,
+            prompt,
+            input_context_for_instruction,
+            langchain_mode.value,
+            langchain_top_k_docs,
+            langchain_enable_chunk,
+            langchain_chunk_size,
+            langchain_document_choice,
+            api_name="/submit_nochat",
+        )
+    async def create_async(
+        self,
+        prompt: str,
+        prompt_type: enums.PromptType = enums.PromptType.plain,
+        input_context_for_instruction: str = "",
+        enable_sampler=False,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        beams: float = 1.0,
+        early_stopping: bool = False,
+        min_output_length: int = 0,
+        max_output_length: int = 128,
+        max_time: int = 180,
+        repetition_penalty: float = 1.07,
+        number_returns: int = 1,
+        system_pre_context: str = "",
+        langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
+    ) -> str:
+        """
+        Creates a new text completion asynchronously.
+        :param prompt: text prompt to generate completions for
+        :param prompt_type: type of the prompt
+        :param input_context_for_instruction: input context for instruction
+        :param enable_sampler: enable or disable the sampler, required for use of
+                temperature, top_p, top_k
+        :param temperature: What sampling temperature to use, between 0 and 3.
+                Lower values will make it more focused and deterministic, but may lead
+                to repeat. Higher values will make the output more creative, but may
+                lead to hallucinations.
+        :param top_p: cumulative probability of tokens to sample from
+        :param top_k: number of tokens to sample from
+        :param beams: Number of searches for optimal overall probability.
+                Higher values uses more GPU memory and compute.
+        :param early_stopping: whether to stop early or not in beam search
+        :param min_output_length: minimum output length
+        :param max_output_length: maximum output length
+        :param max_time: maximum time to search optimal output
+        :param repetition_penalty: penalty for repetition
+        :param number_returns:
+        :param system_pre_context: directly pre-appended without prompt processing
+        :param langchain_mode: LangChain mode
+        :return: response from the model
+        """
+        # Not exposed parameters.
+        instruction = ""  # empty when chat_mode is False
+        input = ""  # only chat_mode is True
+        stream_output = False
+        prompt_dict = ""  # empty as prompt_type cannot be 'custom'
+        chat_mode = False
+        langchain_top_k_docs = 4  # number of document chunks; not public
+        langchain_enable_chunk = True  # whether to chunk documents; not public
+        langchain_chunk_size = 512  # chunk size for document chunking; not public
+        langchain_document_choice = ["All"]  # not public
+        return await self._client._predict_async(
+            instruction,
+            input,
+            system_pre_context,
+            stream_output,
+            prompt_type.value,
+            prompt_dict,
+            temperature,
+            top_p,
+            top_k,
+            beams,
+            max_output_length,
+            min_output_length,
+            early_stopping,
+            max_time,
+            repetition_penalty,
+            number_returns,
+            enable_sampler,
+            chat_mode,
+            prompt,
+            input_context_for_instruction,
+            langchain_mode.value,
+            langchain_top_k_docs,
+            langchain_enable_chunk,
+            langchain_chunk_size,
+            langchain_document_choice,
+            api_name="/submit_nochat",
+        )
+class ChatCompletion:
+    """Chat completion"""
+    def __init__(self, client: Client):
+        self._client = client
+    def create(
+        self,
+        prompt_type: enums.PromptType = enums.PromptType.plain,
+        input_context_for_instruction: str = "",
+        enable_sampler: bool = False,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        beams: float = 1.0,
+        early_stopping: bool = False,
+        min_output_length: int = 0,
+        max_output_length: int = 128,
+        max_time: int = 180,
+        repetition_penalty: float = 1.07,
+        number_returns: int = 1,
+        system_pre_context: str = "",
+        langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
+    ) -> "ChatContext":
+        """
+        Creates a new chat context. No request is sent to the server here; the
+        given parameters are stored in the returned context.
+        :param prompt_type: type of the prompt
+        :param input_context_for_instruction: input context for instruction
+        :param enable_sampler: enable or disable the sampler, required for use of
+                temperature, top_p, top_k
+        :param temperature: What sampling temperature to use, between 0 and 3.
+                Lower values will make it more focused and deterministic, but may lead
+                to repeat. Higher values will make the output more creative, but may
+                lead to hallucinations.
+        :param top_p: cumulative probability of tokens to sample from
+        :param top_k: number of tokens to sample from
+        :param beams: Number of searches for optimal overall probability.
+                Higher values uses more GPU memory and compute.
+        :param early_stopping: whether to stop early or not in beam search
+        :param min_output_length: minimum output length
+        :param max_output_length: maximum output length
+        :param max_time: maximum time to search optimal output
+        :param repetition_penalty: penalty for repetition
+        :param number_returns: number of returns requested from the model
+                (TODO: confirm the exact semantics against the server)
+        :param system_pre_context: directly pre-appended without prompt processing
+        :param langchain_mode: LangChain mode
+        :return: a chat context with given parameters
+        """
+        # The values are later sent positionally (ChatContext.chat unpacks
+        # kwargs.values()), so the insertion order below must be preserved.
+        kwargs = collections.OrderedDict(
+            instruction=None,  # placeholder; set to the prompt by ChatContext.chat
+            input="",  # always sent empty by this client
+            system_pre_context=system_pre_context,
+            stream_output=False,
+            prompt_type=prompt_type.value,
+            prompt_dict="",  # empty as prompt_type cannot be 'custom'
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            beams=beams,
+            max_output_length=max_output_length,
+            min_output_length=min_output_length,
+            early_stopping=early_stopping,
+            max_time=max_time,
+            repetition_penalty=repetition_penalty,
+            number_returns=number_returns,
+            enable_sampler=enable_sampler,
+            chat_mode=True,
+            instruction_nochat="",  # empty when chat_mode is True
+            input_context_for_instruction=input_context_for_instruction,
+            langchain_mode=langchain_mode.value,
+            langchain_top_k_docs=4,  # number of document chunks; not public
+            langchain_enable_chunk=True,  # whether to chunk documents; not public
+            langchain_chunk_size=512,  # chunk size for document chunking; not public
+            langchain_document_choice=["All"],  # not public
+            chatbot=[],  # chat history
+        )
+        return ChatContext(self._client, kwargs)
+class ChatContext:
+    """ "Chat context"""
+    def __init__(self, client: Client, kwargs: OrderedDict[str, Any]):
+        self._client = client
+        self._kwargs = kwargs
+    def chat(self, prompt: str) -> Dict[str, str]:
+        """
+        Chat with the GPT.
+        :param prompt: text prompt to generate completions for
+        :returns chat reply
+        """
+        self._kwargs["instruction"] = prompt
+        self._kwargs["chatbot"] += [[prompt, None]]
+        response: Tuple[List[List[str]], str] = self._client._predict(
+            *self._kwargs.values(), api_name="/instruction_bot"
+        )
+        self._kwargs["chatbot"][-1][1] = response[0][-1][1]
+        return {"user": response[0][-1][0], "gpt": response[0][-1][1]}
+    def chat_history(self) -> List[Dict[str, str]]:
+        """Returns the full chat history."""
+        return [{"user": i[0], "gpt": i[1]} for i in self._kwargs["chatbot"]]

client/poetry.lock ADDED Viewed

	@@ -0,0 +1,876 @@

+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+[[package]]
+name = "anyio"
+version = "3.6.2"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+category = "main"
+optional = false
+python-versions = ">=3.6.2"
+files = [
+    {file = "anyio-3.6.2-py3-none-any.whl", hash = "sha256:fbbe32bd270d2a2ef3ed1c5d45041250284e31fc0a4df4a5a6071842051a51e3"},
+    {file = "anyio-3.6.2.tar.gz", hash = "sha256:25ea0d673ae30af41a0c442f81cf3b38c7e79fdc7b60335a4c14e05eb0947421"},
+]
+[package.dependencies]
+idna = ">=2.8"
+sniffio = ">=1.1"
+[package.extras]
+doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
+test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"]
+trio = ["trio (>=0.16,<0.22)"]
+[[package]]
+name = "black"
+version = "23.3.0"
+description = "The uncompromising code formatter."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
+    {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
+    {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
+    {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
+    {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
+    {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
+    {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
+    {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
+    {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
+    {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
+    {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
+    {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
+    {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
+    {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
+]
+[package.dependencies]
+click = ">=8.0.0"
+mypy-extensions = ">=0.4.3"
+packaging = ">=22.0"
+pathspec = ">=0.9.0"
+platformdirs = ">=2"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
+[package.extras]
+colorama = ["colorama (>=0.4.3)"]
+d = ["aiohttp (>=3.7.4)"]
+jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
+uvloop = ["uvloop (>=0.15.2)"]
+[[package]]
+name = "certifi"
+version = "2023.5.7"
+description = "Python package for providing Mozilla's CA Bundle."
+category = "main"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"},
+    {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
+]
+[[package]]
+name = "charset-normalizer"
+version = "3.1.0"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+category = "main"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"},
+    {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"},
+    {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"},
+    {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"},
+    {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"},
+    {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"},
+    {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"},
+]
+[[package]]
+name = "click"
+version = "8.1.3"
+description = "Composable command line interface toolkit"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
+    {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
+]
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+category = "main"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+files = [
+    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+[[package]]
+name = "exceptiongroup"
+version = "1.1.1"
+description = "Backport of PEP 654 (exception groups)"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"},
+    {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"},
+]
+[package.extras]
+test = ["pytest (>=6)"]
+[[package]]
+name = "filelock"
+version = "3.12.0"
+description = "A platform independent file lock."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "filelock-3.12.0-py3-none-any.whl", hash = "sha256:ad98852315c2ab702aeb628412cbf7e95b7ce8c3bf9565670b4eaecf1db370a9"},
+    {file = "filelock-3.12.0.tar.gz", hash = "sha256:fc03ae43288c013d2ea83c8597001b1129db351aad9c57fe2409327916b8e718"},
+]
+[package.extras]
+docs = ["furo (>=2023.3.27)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"]
+[[package]]
+name = "flake8"
+version = "5.0.4"
+description = "the modular source code checker: pep8 pyflakes and co"
+category = "dev"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
+    {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
+]
+[package.dependencies]
+mccabe = ">=0.7.0,<0.8.0"
+pycodestyle = ">=2.9.0,<2.10.0"
+pyflakes = ">=2.5.0,<2.6.0"
+[[package]]
+name = "flake8-pyproject"
+version = "1.2.3"
+description = "Flake8 plug-in loading the configuration from pyproject.toml"
+category = "dev"
+optional = false
+python-versions = ">= 3.6"
+files = [
+    {file = "flake8_pyproject-1.2.3-py3-none-any.whl", hash = "sha256:6249fe53545205af5e76837644dc80b4c10037e73a0e5db87ff562d75fb5bd4a"},
+]
+[package.dependencies]
+Flake8 = ">=5"
+TOMLi = {version = "*", markers = "python_version < \"3.11\""}
+[package.extras]
+dev = ["pyTest", "pyTest-cov"]
+[[package]]
+name = "fsspec"
+version = "2023.5.0"
+description = "File-system specification"
+category = "main"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "fsspec-2023.5.0-py3-none-any.whl", hash = "sha256:51a4ad01a5bb66fcc58036e288c0d53d3975a0df2a5dc59a93b59bade0391f2a"},
+    {file = "fsspec-2023.5.0.tar.gz", hash = "sha256:b3b56e00fb93ea321bc9e5d9cf6f8522a0198b20eb24e02774d329e9c6fb84ce"},
+]
+[package.extras]
+abfs = ["adlfs"]
+adl = ["adlfs"]
+arrow = ["pyarrow (>=1)"]
+dask = ["dask", "distributed"]
+devel = ["pytest", "pytest-cov"]
+dropbox = ["dropbox", "dropboxdrivefs", "requests"]
+full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"]
+fuse = ["fusepy"]
+gcs = ["gcsfs"]
+git = ["pygit2"]
+github = ["requests"]
+gs = ["gcsfs"]
+gui = ["panel"]
+hdfs = ["pyarrow (>=1)"]
+http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"]
+libarchive = ["libarchive-c"]
+oci = ["ocifs"]
+s3 = ["s3fs"]
+sftp = ["paramiko"]
+smb = ["smbprotocol"]
+ssh = ["paramiko"]
+tqdm = ["tqdm"]
+[[package]]
+name = "gradio-client"
+version = "0.2.6"
+description = "Python library for easily interacting with trained machine learning models"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "gradio_client-0.2.6-py3-none-any.whl", hash = "sha256:daa3e2f7697cc07821ccefa7486b923e71c1e459d5b3c6e35318f343b566d789"},
+    {file = "gradio_client-0.2.6.tar.gz", hash = "sha256:a5d5c5799ce33ae3107e1d30992c27050f50506f15dd70481a39b13ac47e2613"},
+]
+[package.dependencies]
+fsspec = "*"
+httpx = "*"
+huggingface-hub = ">=0.13.0"
+packaging = "*"
+requests = "*"
+typing-extensions = "*"
+websockets = "*"
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+[[package]]
+name = "httpcore"
+version = "0.17.0"
+description = "A minimal low-level HTTP client."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "httpcore-0.17.0-py3-none-any.whl", hash = "sha256:0fdfea45e94f0c9fd96eab9286077f9ff788dd186635ae61b312693e4d943599"},
+    {file = "httpcore-0.17.0.tar.gz", hash = "sha256:cc045a3241afbf60ce056202301b4d8b6af08845e3294055eb26b09913ef903c"},
+]
+[package.dependencies]
+anyio = ">=3.0,<5.0"
+certifi = "*"
+h11 = ">=0.13,<0.15"
+sniffio = ">=1.0.0,<2.0.0"
+[package.extras]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (>=1.0.0,<2.0.0)"]
+[[package]]
+name = "httpx"
+version = "0.24.0"
+description = "The next generation HTTP client."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "httpx-0.24.0-py3-none-any.whl", hash = "sha256:447556b50c1921c351ea54b4fe79d91b724ed2b027462ab9a329465d147d5a4e"},
+    {file = "httpx-0.24.0.tar.gz", hash = "sha256:507d676fc3e26110d41df7d35ebd8b3b8585052450f4097401c9be59d928c63e"},
+]
+[package.dependencies]
+certifi = "*"
+httpcore = ">=0.15.0,<0.18.0"
+idna = "*"
+sniffio = "*"
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (>=1.0.0,<2.0.0)"]
+[[package]]
+name = "huggingface-hub"
+version = "0.14.1"
+description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
+category = "main"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "huggingface_hub-0.14.1-py3-none-any.whl", hash = "sha256:9fc619170d800ff3793ad37c9757c255c8783051e1b5b00501205eb43ccc4f27"},
+    {file = "huggingface_hub-0.14.1.tar.gz", hash = "sha256:9ab899af8e10922eac65e290d60ab956882ab0bf643e3d990b1394b6b47b7fbc"},
+]
+[package.dependencies]
+filelock = "*"
+fsspec = "*"
+packaging = ">=20.9"
+pyyaml = ">=5.1"
+requests = "*"
+tqdm = ">=4.42.1"
+typing-extensions = ">=3.7.4.3"
+[package.extras]
+all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"]
+cli = ["InquirerPy (==0.3.4)"]
+dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"]
+fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
+quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"]
+tensorflow = ["graphviz", "pydot", "tensorflow"]
+testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "gradio", "jedi", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "soundfile"]
+torch = ["torch"]
+typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"]
+[[package]]
+name = "idna"
+version = "3.4"
+description = "Internationalized Domain Names in Applications (IDNA)"
+category = "main"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"},
+    {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
+]
+[[package]]
+name = "iniconfig"
+version = "2.0.0"
+description = "brain-dead simple config-ini parsing"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
+    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
+]
+[[package]]
+name = "isort"
+version = "5.12.0"
+description = "A Python utility / library to sort Python imports."
+category = "dev"
+optional = false
+python-versions = ">=3.8.0"
+files = [
+    {file = "isort-5.12.0-py3-none-any.whl", hash = "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"},
+    {file = "isort-5.12.0.tar.gz", hash = "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504"},
+]
+[package.extras]
+colors = ["colorama (>=0.4.3)"]
+pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"]
+plugins = ["setuptools"]
+requirements-deprecated-finder = ["pip-api", "pipreqs"]
+[[package]]
+name = "mccabe"
+version = "0.7.0"
+description = "McCabe checker, plugin for flake8"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
+    {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
+]
+[[package]]
+name = "mypy"
+version = "1.3.0"
+description = "Optional static typing for Python"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"},
+    {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"},
+    {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"},
+    {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"},
+    {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"},
+    {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"},
+    {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"},
+    {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"},
+    {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"},
+    {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"},
+    {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"},
+    {file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"},
+    {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"},
+    {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"},
+    {file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"},
+    {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"},
+    {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"},
+    {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"},
+    {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"},
+    {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"},
+    {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"},
+    {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"},
+    {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"},
+    {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"},
+    {file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"},
+    {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"},
+]
+[package.dependencies]
+mypy-extensions = ">=1.0.0"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = ">=3.10"
+[package.extras]
+dmypy = ["psutil (>=4.0)"]
+install-types = ["pip"]
+python2 = ["typed-ast (>=1.4.0,<2)"]
+reports = ["lxml"]
+[[package]]
+name = "mypy-extensions"
+version = "1.0.0"
+description = "Type system extensions for programs checked with the mypy type checker."
+category = "dev"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
+    {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
+]
+[[package]]
+name = "packaging"
+version = "23.1"
+description = "Core utilities for Python packages"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"},
+    {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
+]
+[[package]]
+name = "pathspec"
+version = "0.11.1"
+description = "Utility library for gitignore style pattern matching of file paths."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"},
+    {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"},
+]
+[[package]]
+name = "platformdirs"
+version = "3.5.0"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "platformdirs-3.5.0-py3-none-any.whl", hash = "sha256:47692bc24c1958e8b0f13dd727307cff1db103fca36399f457da8e05f222fdc4"},
+    {file = "platformdirs-3.5.0.tar.gz", hash = "sha256:7954a68d0ba23558d753f73437c55f89027cf8f5108c19844d4b82e5af396335"},
+]
+[package.extras]
+docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"]
+[[package]]
+name = "pluggy"
+version = "1.0.0"
+description = "plugin and hook calling mechanisms for python"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
+    {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
+]
+[package.extras]
+dev = ["pre-commit", "tox"]
+testing = ["pytest", "pytest-benchmark"]
+[[package]]
+name = "pycodestyle"
+version = "2.9.1"
+description = "Python style guide checker"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
+    {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
+]
+[[package]]
+name = "pyflakes"
+version = "2.5.0"
+description = "passive checker of Python programs"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
+    {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
+]
+[[package]]
+name = "pytest"
+version = "7.3.1"
+description = "pytest: simple powerful testing with Python"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
+    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
+]
+[package.dependencies]
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=0.12,<2.0"
+tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
+[package.extras]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
+[[package]]
+name = "pytest-asyncio"
+version = "0.21.0"
+description = "Pytest support for asyncio"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"},
+    {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"},
+]
+[package.dependencies]
+pytest = ">=7.0.0"
+[package.extras]
+docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
+testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
+[[package]]
+name = "pyyaml"
+version = "6.0"
+description = "YAML parser and emitter for Python"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
+    {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
+    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
+    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
+    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
+    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
+    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
+    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
+    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
+    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"},
+    {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"},
+    {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"},
+    {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"},
+    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"},
+    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"},
+    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"},
+    {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"},
+    {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"},
+    {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"},
+    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"},
+    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"},
+    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"},
+    {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"},
+    {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"},
+    {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"},
+    {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"},
+    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"},
+    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"},
+    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"},
+    {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"},
+    {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
+    {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
+]
+[[package]]
+name = "requests"
+version = "2.30.0"
+description = "Python HTTP for Humans."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "requests-2.30.0-py3-none-any.whl", hash = "sha256:10e94cc4f3121ee6da529d358cdaeaff2f1c409cd377dbc72b825852f2f7e294"},
+    {file = "requests-2.30.0.tar.gz", hash = "sha256:239d7d4458afcb28a692cdd298d87542235f4ca8d36d03a15bfc128a6559a2f4"},
+]
+[package.dependencies]
+certifi = ">=2017.4.17"
+charset-normalizer = ">=2,<4"
+idna = ">=2.5,<4"
+urllib3 = ">=1.21.1,<3"
+[package.extras]
+socks = ["PySocks (>=1.5.6,!=1.5.7)"]
+use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
+[[package]]
+name = "sniffio"
+version = "1.3.0"
+description = "Sniff out which async library your code is running under"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
+    {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
+]
+[[package]]
+name = "tomli"
+version = "2.0.1"
+description = "A lil' TOML parser"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
+    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
+]
+[[package]]
+name = "tqdm"
+version = "4.65.0"
+description = "Fast, Extensible Progress Meter"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"},
+    {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"},
+]
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+[package.extras]
+dev = ["py-make (>=0.1.0)", "twine", "wheel"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+[[package]]
+name = "typing-extensions"
+version = "4.5.0"
+description = "Backported and Experimental Type Hints for Python 3.7+"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"},
+    {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"},
+]
+[[package]]
+name = "urllib3"
+version = "2.0.2"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"},
+    {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"},
+]
+[package.extras]
+brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
+secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"]
+socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
+zstd = ["zstandard (>=0.18.0)"]
+[[package]]
+name = "websockets"
+version = "11.0.3"
+description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac"},
+    {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d"},
+    {file = "websockets-11.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f"},
+    {file = "websockets-11.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564"},
+    {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11"},
+    {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca"},
+    {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54"},
+    {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4"},
+    {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526"},
+    {file = "websockets-11.0.3-cp310-cp310-win32.whl", hash = "sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69"},
+    {file = "websockets-11.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f"},
+    {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb"},
+    {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288"},
+    {file = "websockets-11.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d"},
+    {file = "websockets-11.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3"},
+    {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b"},
+    {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6"},
+    {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97"},
+    {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf"},
+    {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd"},
+    {file = "websockets-11.0.3-cp311-cp311-win32.whl", hash = "sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c"},
+    {file = "websockets-11.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8"},
+    {file = "websockets-11.0.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152"},
+    {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f"},
+    {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b"},
+    {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb"},
+    {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007"},
+    {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0"},
+    {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af"},
+    {file = "websockets-11.0.3-cp37-cp37m-win32.whl", hash = "sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f"},
+    {file = "websockets-11.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de"},
+    {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0"},
+    {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae"},
+    {file = "websockets-11.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99"},
+    {file = "websockets-11.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa"},
+    {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86"},
+    {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c"},
+    {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0"},
+    {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e"},
+    {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788"},
+    {file = "websockets-11.0.3-cp38-cp38-win32.whl", hash = "sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74"},
+    {file = "websockets-11.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f"},
+    {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8"},
+    {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd"},
+    {file = "websockets-11.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016"},
+    {file = "websockets-11.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61"},
+    {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b"},
+    {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd"},
+    {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7"},
+    {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1"},
+    {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311"},
+    {file = "websockets-11.0.3-cp39-cp39-win32.whl", hash = "sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128"},
+    {file = "websockets-11.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602"},
+    {file = "websockets-11.0.3-py3-none-any.whl", hash = "sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6"},
+    {file = "websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"},
+]
+[metadata]
+lock-version = "2.0"
+python-versions = "^3.8"
+content-hash = "ddf46ff1beecd493f52ac6e719fe23967c38927d0f729029cc33ca47663669d8"

client/poetry.toml ADDED Viewed

	@@ -0,0 +1 @@


1	+ virtualenvs.in-project = true

client/pyproject.toml ADDED Viewed

	@@ -0,0 +1,39 @@

+[tool.poetry]
+name = "h2ogpt-client"
+version = "0.1.0"
+description = ""
+authors = []
+readme = "README.md"
+packages = [{include = "h2ogpt_client"}]
+[tool.poetry.dependencies]
+python = "^3.8"
+gradio-client = "^0.2.2"
+[tool.poetry.group.test.dependencies]
+pytest = "^7.3.1"
+pytest-asyncio = "^0.21.0"
+[tool.poetry.group.dev.dependencies]
+mypy = "^1.3.0"
+black = "^23.3.0"
+flake8 = "5.0.4"
+isort = "^5.12.0"
+flake8-pyproject = "^1.2.3"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+[tool.isort]
+profile = "black"
+py_version = "auto"
+[tool.flake8]
+max-line-length = 88
+[tool.mypy]
+python_version = "3.8"
+[tool.pytest.ini_options]
+pythonpath = "h2ogpt_client"

client/tests/__init__.py ADDED Viewed

File without changes

client/tests/test_client.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import os
+import pytest
+from h2ogpt_client import Client
+def create_client(server_url: str = "") -> Client:
+    server_url = server_url or os.getenv("H2OGPT_SERVER", "http://0.0.0.0:7860")
+    return Client(server_url)
+def test_text_completion():
+    launch_server()
+    client = create_client()
+    r = client.text_completion.create("Hello world")
+    assert r
+    print(r)
+async def test_text_completion_async():
+    launch_server()
+    client = create_client()
+    r = await client.text_completion.create_async("Hello world")
+    assert r
+    print(r)
+def test_chat_completion():
+    launch_server()
+    client = create_client()
+    chat_context = client.chat_completion.create()
+    chat1 = chat_context.chat("Hey!")
+    assert chat1["user"] == "Hey!"
+    assert chat1["gpt"]
+    chat2 = chat_context.chat("How are you?")
+    assert chat2["user"] == "How are you?"
+    assert chat2["gpt"]
+    chat3 = chat_context.chat("Have a good day")
+    assert chat3["user"] == "Have a good day"
+    assert chat3["gpt"]
+    chat_history = chat_context.chat_history()
+    assert chat_history == [chat1, chat2, chat3]
+    print(chat_history)
+def launch_server():
+    from generate import main
+    main(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b', prompt_type='human_bot', chat=False,
+         stream_output=False, gradio=True, num_beams=1, block_gradio_exit=False)

client_test.py ADDED Viewed

	@@ -0,0 +1,337 @@

+"""
+Client test.
+Run server:
+python generate.py  --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b
+NOTE: For private models, add --use-auth_token=True
+NOTE: --infer_devices=True (default) must be used for multi-GPU in case see failures with cuda:x cuda:y mismatches.
+Currently, this will force model to be on a single GPU.
+Then run this client as:
+python client_test.py
+For HF spaces:
+HOST="https://h2oai-h2ogpt-chatbot.hf.space" python client_test.py
+Result:
+Loaded as API: https://h2oai-h2ogpt-chatbot.hf.space ✔
+{'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a large language model developed by LAION.', 'sources': ''}
+For demo:
+HOST="https://gpt.h2o.ai" python client_test.py
+Result:
+Loaded as API: https://gpt.h2o.ai ✔
+{'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.', 'sources': ''}
+NOTE: Raw output from API for nochat case is a string of a python dict and will remain so if other entries are added to dict:
+{'response': "I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.", 'sources': ''}
+"""
+import ast
+import time
+import os
+import markdown  # pip install markdown
+import pytest
+from bs4 import BeautifulSoup  # pip install beautifulsoup4
+from enums import DocumentChoices
+debug = False
+os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
+def get_client(serialize=True):
+    from gradio_client import Client
+    client = Client(os.getenv('HOST', "http://localhost:7860"), serialize=serialize)
+    if debug:
+        print(client.view_api(all_endpoints=True))
+    return client
+def get_args(prompt, prompt_type, chat=False, stream_output=False,
+             max_new_tokens=50,
+             top_k_docs=3,
+             langchain_mode='Disabled', prompt_dict=''):
+    from collections import OrderedDict
+    kwargs = OrderedDict(instruction=prompt if chat else '',  # only for chat=True
+                         iinput='',  # only for chat=True
+                         context='',
+                         # streaming output is supported, loops over and outputs each generation in streaming mode
+                         # but leave stream_output=False for simple input/output mode
+                         stream_output=stream_output,
+                         prompt_type=prompt_type,
+                         prompt_dict=prompt_dict,
+                         temperature=0.1,
+                         top_p=0.75,
+                         top_k=40,
+                         num_beams=1,
+                         max_new_tokens=max_new_tokens,
+                         min_new_tokens=0,
+                         early_stopping=False,
+                         max_time=20,
+                         repetition_penalty=1.0,
+                         num_return_sequences=1,
+                         do_sample=True,
+                         chat=chat,
+                         instruction_nochat=prompt if not chat else '',
+                         iinput_nochat='',  # only for chat=False
+                         langchain_mode=langchain_mode,
+                         top_k_docs=top_k_docs,
+                         chunk=True,
+                         chunk_size=512,
+                         document_choice=[DocumentChoices.All_Relevant.name],
+                         )
+    from generate import eval_func_param_names
+    assert len(set(eval_func_param_names).difference(set(list(kwargs.keys())))) == 0
+    if chat:
+        # add chatbot output on end.  Assumes serialize=False
+        kwargs.update(dict(chatbot=[]))
+    return kwargs, list(kwargs.values())
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_basic(prompt_type='human_bot'):
+    return run_client_nochat(prompt='Who are you?', prompt_type=prompt_type, max_new_tokens=50)
+def run_client_nochat(prompt, prompt_type, max_new_tokens):
+    kwargs, args = get_args(prompt, prompt_type, chat=False, max_new_tokens=max_new_tokens)
+    api_name = '/submit_nochat'
+    client = get_client(serialize=True)
+    res = client.predict(
+        *tuple(args),
+        api_name=api_name,
+    )
+    print("Raw client result: %s" % res, flush=True)
+    res_dict = dict(prompt=kwargs['instruction_nochat'], iinput=kwargs['iinput_nochat'],
+                    response=md_to_text(res))
+    print(res_dict)
+    return res_dict, client
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_basic_api(prompt_type='human_bot'):
+    return run_client_nochat_api(prompt='Who are you?', prompt_type=prompt_type, max_new_tokens=50)
+def run_client_nochat_api(prompt, prompt_type, max_new_tokens):
+    kwargs, args = get_args(prompt, prompt_type, chat=False, max_new_tokens=max_new_tokens)
+    api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
+    client = get_client(serialize=True)
+    res = client.predict(
+        str(dict(kwargs)),
+        api_name=api_name,
+    )
+    print("Raw client result: %s" % res, flush=True)
+    res_dict = dict(prompt=kwargs['instruction_nochat'], iinput=kwargs['iinput_nochat'],
+                    response=md_to_text(ast.literal_eval(res)['response']),
+                    sources=ast.literal_eval(res)['sources'])
+    print(res_dict)
+    return res_dict, client
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_basic_api_lean(prompt_type='human_bot'):
+    return run_client_nochat_api_lean(prompt='Who are you?', prompt_type=prompt_type, max_new_tokens=50)
+def run_client_nochat_api_lean(prompt, prompt_type, max_new_tokens):
+    kwargs = dict(instruction_nochat=prompt)
+    api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
+    client = get_client(serialize=True)
+    res = client.predict(
+        str(dict(kwargs)),
+        api_name=api_name,
+    )
+    print("Raw client result: %s" % res, flush=True)
+    res_dict = dict(prompt=kwargs['instruction_nochat'],
+                    response=md_to_text(ast.literal_eval(res)['response']),
+                    sources=ast.literal_eval(res)['sources'])
+    print(res_dict)
+    return res_dict, client
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_basic_api_lean_morestuff(prompt_type='human_bot'):
+    return run_client_nochat_api_lean_morestuff(prompt='Who are you?', prompt_type=prompt_type, max_new_tokens=50)
+def run_client_nochat_api_lean_morestuff(prompt, prompt_type='human_bot', max_new_tokens=512):
+    kwargs = dict(
+        instruction='',
+        iinput='',
+        context='',
+        stream_output=False,
+        prompt_type=prompt_type,
+        temperature=0.1,
+        top_p=0.75,
+        top_k=40,
+        num_beams=1,
+        max_new_tokens=256,
+        min_new_tokens=0,
+        early_stopping=False,
+        max_time=20,
+        repetition_penalty=1.0,
+        num_return_sequences=1,
+        do_sample=True,
+        chat=False,
+        instruction_nochat=prompt,
+        iinput_nochat='',
+        langchain_mode='Disabled',
+        top_k_docs=4,
+        document_choice=['All'],
+    )
+    api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
+    client = get_client(serialize=True)
+    res = client.predict(
+        str(dict(kwargs)),
+        api_name=api_name,
+    )
+    print("Raw client result: %s" % res, flush=True)
+    res_dict = dict(prompt=kwargs['instruction_nochat'],
+                    response=md_to_text(ast.literal_eval(res)['response']),
+                    sources=ast.literal_eval(res)['sources'])
+    print(res_dict)
+    return res_dict, client
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_chat(prompt_type='human_bot'):
+    return run_client_chat(prompt='Who are you?', prompt_type=prompt_type, stream_output=False, max_new_tokens=50,
+                           langchain_mode='Disabled')
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_chat_stream(prompt_type='human_bot'):
+    return run_client_chat(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
+                           stream_output=True, max_new_tokens=512,
+                           langchain_mode='Disabled')
+def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens, langchain_mode, prompt_dict=None):
+    client = get_client(serialize=False)
+    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
+                            max_new_tokens=max_new_tokens, langchain_mode=langchain_mode,
+                            prompt_dict=prompt_dict)
+    return run_client(client, prompt, args, kwargs)
+def run_client(client, prompt, args, kwargs, do_md_to_text=True, verbose=False):  # two-step chat call: /instruction first, then /instruction_bot; returns (res_dict, client)
+    assert kwargs['chat'], "Chat mode only"
+    res = client.predict(*tuple(args), api_name='/instruction')
+    args[-1] += [res[-1]]  # extends the caller's last positional argument in place with the final /instruction output
+    res_dict = kwargs  # alias, not a copy: keys added below also appear in the caller's kwargs
+    res_dict['prompt'] = prompt
+    if not kwargs['stream_output']:
+        res = client.predict(*tuple(args), api_name='/instruction_bot')
+        res_dict['response'] = res[0][-1][1]  # stored raw; only the printed copy goes through md_to_text
+        print(md_to_text(res_dict['response'], do_md_to_text=do_md_to_text))
+        return res_dict, client
+    else:
+        job = client.submit(*tuple(args), api_name='/instruction_bot')
+        res1 = ''
+        while not job.done():  # poll every 0.1s and print the newest partial output, if any
+            outputs_list = job.communicator.job.outputs
+            if outputs_list:
+                res = job.communicator.job.outputs[-1]  # re-reads the outputs attribute rather than reusing outputs_list
+                res1 = res[0][-1][-1]
+                res1 = md_to_text(res1, do_md_to_text=do_md_to_text)
+                print(res1)
+            time.sleep(0.1)
+        full_outputs = job.outputs()
+        if verbose:
+            print('job.outputs: %s' % str(full_outputs))
+        # ensure get ending to avoid race
+        # -1 means last response if streaming
+        # 0 means get text_output, ignore exception_text
+        # 0 means get list within text_output that looks like [[prompt], [answer]]
+        # 1 means get bot answer, so will have last bot answer
+        res_dict['response'] = md_to_text(full_outputs[-1][0][0][1], do_md_to_text=do_md_to_text)  # unlike the non-streaming branch, the stored response is passed through md_to_text
+        return res_dict, client
+@pytest.mark.skip(reason="For manual use against some server, no server launched")
+def test_client_nochat_stream(prompt_type='human_bot'):
+    return run_client_nochat_gen(prompt="Tell a very long kid's story about birds.", prompt_type=prompt_type,
+                                 stream_output=True, max_new_tokens=512,
+                                 langchain_mode='Disabled')
+def run_client_nochat_gen(prompt, prompt_type, stream_output, max_new_tokens, langchain_mode):
+    client = get_client(serialize=False)
+    kwargs, args = get_args(prompt, prompt_type, chat=False, stream_output=stream_output,
+                            max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)
+    return run_client_gen(client, prompt, args, kwargs)
+def run_client_gen(client, prompt, args, kwargs, do_md_to_text=True, verbose=False):  # call /submit_nochat_api once, or stream it while printing partial responses; `args` and `verbose` are not used in this body
+    res_dict = kwargs  # alias, not a copy: 'prompt' is added to the caller's kwargs and is therefore also sent to the server
+    res_dict['prompt'] = prompt
+    if not kwargs['stream_output']:
+        res = client.predict(str(dict(kwargs)), api_name='/submit_nochat_api')
+        res_dict['response'] = res[0]  # NOTE(review): sibling helpers ast.literal_eval this endpoint's result; confirm res[0] is the intended value
+        print(md_to_text(res_dict['response'], do_md_to_text=do_md_to_text))
+        return res_dict, client
+    else:
+        job = client.submit(str(dict(kwargs)), api_name='/submit_nochat_api')
+        while not job.done():  # poll every 0.1s and print the newest partial response, if any
+            outputs_list = job.communicator.job.outputs
+            if outputs_list:
+                res = job.communicator.job.outputs[-1]
+                res_dict = ast.literal_eval(res)  # rebinds res_dict to the parsed server dict; it no longer refers to kwargs
+                print('Stream: %s' % res_dict['response'])
+            time.sleep(0.1)
+        res_list = job.outputs()
+        assert len(res_list) > 0, "No response, check server"
+        res = res_list[-1]
+        res_dict = ast.literal_eval(res)  # the returned dict is the server's final output, so it does not carry the 'prompt' key set above
+        print('Final: %s' % res_dict['response'])
+        return res_dict, client
+def md_to_text(md, do_md_to_text=True):
+    if not do_md_to_text:
+        return md
+    assert md is not None, "Markdown is None"
+    html = markdown.markdown(md)
+    soup = BeautifulSoup(html, features='html.parser')
+    return soup.get_text()
+def run_client_many(prompt_type='human_bot'):
+    ret1, _ = test_client_chat(prompt_type=prompt_type)
+    ret2, _ = test_client_chat_stream(prompt_type=prompt_type)
+    ret3, _ = test_client_nochat_stream(prompt_type=prompt_type)
+    ret4, _ = test_client_basic(prompt_type=prompt_type)
+    ret5, _ = test_client_basic_api(prompt_type=prompt_type)
+    ret6, _ = test_client_basic_api_lean(prompt_type=prompt_type)
+    ret7, _ = test_client_basic_api_lean_morestuff(prompt_type=prompt_type)
+    return ret1, ret2, ret3, ret4, ret5, ret6, ret7
+if __name__ == '__main__':
+    run_client_many()

create_data.py ADDED Viewed

	@@ -0,0 +1,1809 @@

+"""
+Dataset creation tools.
+Keep top-level imports clean of non-trivial imports for specific tools,
+because this file is imported for various purposes
+"""
+import ast
+import concurrent.futures
+import contextlib
+import hashlib
+import json
+import os
+import shutil
+import signal
+import sys
+import traceback
+from concurrent.futures import ProcessPoolExecutor
+import psutil
+import pytest
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from utils import flatten_list, remove
+def parse_rst_file(filepath):
+    with open(filepath, 'r') as f:
+        input_data = f.read()
+    settings_overrides = {'initial_header_level': 2}
+    from docutils import core
+    document = core.publish_doctree(
+        source=input_data,
+        source_path=filepath,
+        settings_overrides=settings_overrides,
+    )
+    qa_pairs = []
+    current_section = None
+    current_question = ""
+    current_answer = ""
+    for node in document.traverse():
+        if node.__class__.__name__ == 'section':
+            current_section = ""
+        elif current_section is not None:
+            if node.__class__.__name__ == 'Text':
+                if node.astext()[-1] == "?":
+                    if current_question:
+                        qa_pairs.append((current_question, current_answer))
+                    current_question = node.astext()
+                    current_answer = ""
+                else:
+                    current_answer += node.astext()
+    if current_answer:
+        qa_pairs.append((current_question, current_answer))
+    return {k: v for k, v in qa_pairs}
+def test_scrape_dai_docs():
+    home = os.path.expanduser('~')
+    file = os.path.join(home, 'h2oai/docs/faq.rst')
+    qa_pairs = parse_rst_file(file)
+    prompt_type = 'human_bot'
+    from prompter import prompt_types
+    assert prompt_type in prompt_types
+    save_thing = [{"instruction": k, "output": v, 'prompt_type': prompt_type} for k, v in qa_pairs.items()]
+    output_file = "dai_faq.json"
+    with open(output_file, "wt") as f:
+        f.write(json.dumps(save_thing, indent=2))
+def test_scrape_dai_docs_all():
+    """
+    pytest create_data.py::test_scrape_dai_docs_all
+    """
+    import glob
+    import nltk
+    nltk.download('punkt')
+    dd = {}
+    np.random.seed(1234)
+    home = os.path.expanduser('~')
+    files = list(glob.glob(os.path.join(home, "h2oai/docs/**/*rst")))
+    np.random.shuffle(files)
+    val_count = int(0.05 * len(files))
+    train_files = files[val_count:]
+    valid_files = files[:val_count]
+    things = [
+        ("dai_docs.train.json", train_files),
+        ("dai_docs.valid.json", valid_files)
+    ]
+    for LEN in [100, 200, 500]:
+        for output_file, ff in things:
+            if output_file not in dd:
+                dd[output_file] = []
+            for f in ff:
+                with open(f) as input:
+                    blob = input.read()
+                    blob = blob.replace("~~", "")
+                    blob = blob.replace("==", "")
+                    blob = blob.replace("''", "")
+                    blob = blob.replace("--", "")
+                    blob = blob.replace("**", "")
+                    dd[output_file].extend(get_sentences(blob, length=LEN))
+    for output_file, _ in things:
+        save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in dd[output_file]]
+        with open(output_file, "wt") as f:
+            f.write(json.dumps(save_thing, indent=2))
+def get_sentences(blob, length):
+    """
+    break-up input text into sentences and then output list of sentences of about length in size
+    :param blob:
+    :param length:
+    :return:
+    """
+    import nltk
+    nltk.download('punkt')
+    from nltk.tokenize import sent_tokenize
+    sentences = sent_tokenize(blob)
+    my_sentences = []
+    my_string = ""
+    for sentence in sentences:
+        if len(my_string) + len(sentence) <= length:
+            if my_string:
+                my_string += " " + sentence
+            else:
+                my_string = sentence
+        else:
+            my_sentences.append(my_string)
+            my_string = ""
+    return my_sentences or [my_string]
+def setup_dai_docs(path=None, dst="working_dir_docs", from_hf=False):
+    """
+    Only supported if have access to source code or HF token for HF spaces and from_hf=True
+    :param path:
+    :param dst:
+    :param from_hf:
+    :return:
+    """
+    home = os.path.expanduser('~')
+    if from_hf:
+        # assumes
+        from huggingface_hub import hf_hub_download
+        # True for case when locally already logged in with correct token, so don't have to set key
+        token = os.getenv('HUGGINGFACE_API_TOKEN', True)
+        path_to_zip_file = hf_hub_download('h2oai/dai_docs', 'dai_docs.zip', token=token, repo_type='dataset')
+        path = 'h2oai'
+        import zipfile
+        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
+            zip_ref.extractall(path)
+        path = os.path.join(path, 'docs/**/*')
+    if path is None:
+        if os.path.isdir(os.path.join(home, 'h2oai')):
+            path = os.path.join(home, "h2oai/docs/**/*")
+        else:
+            assert os.path.isdir(os.path.join(home, 'h2oai.superclean')), '%s does not exist' % path
+            path = os.path.join(home, "h2oai.superclean/docs/**/*")
+    import glob
+    files = list(glob.glob(path, recursive=True))
+    # pandoc can't find include files
+    remove(dst)
+    os.makedirs(dst)
+    # copy full tree, for absolute paths in rst
+    for fil in files:
+        if os.path.isfile(fil):
+            shutil.copy(fil, dst)
+    # hack for relative path
+    scorers_dir = os.path.join(dst, 'scorers')
+    makedirs(scorers_dir)
+    for fil in glob.glob(os.path.join(dst, '*.frag')):
+        shutil.copy(fil, scorers_dir)
+    return dst
+def rst_to_outputs(files, min_len=30, max_len=2048 // 2 - 30):
+    # account for sequence length (context window) including prompt and input and output
+    # os.system('pandoc -f rst -t plain ./expert_settings/nlp_settings.rst')
+    import pypandoc
+    basedir = os.path.abspath(os.getcwd())
+    outputs = []
+    for fil in files:
+        os.chdir(basedir)
+        os.chdir(os.path.dirname(fil))
+        fil = os.path.basename(fil)
+        print("Processing %s" % fil, flush=True)
+        # out_format can be one of: asciidoc, asciidoctor, beamer, biblatex, bibtex, commonmark, commonmark_x,
+        # context, csljson, docbook, docbook4, docbook5, docx, dokuwiki,
+        # dzslides, epub, epub2, epub3, fb2, gfm, haddock, html, html4, html5, icml,
+        # ipynb, jats, jats_archiving, jats_articleauthoring, jats_publishing, jira,
+        # json, latex, man,
+        # markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict,
+        # mediawiki, ms, muse, native, odt, opendocument, opml, org, pdf, plain, pptx,
+        # revealjs, rst, rtf, s5, slideous, slidy, tei, texinfo, textile, xwiki, zimwiki
+        out_format = 'plain'
+        # avoid extra new lines injected into text
+        extra_args = ['--wrap=preserve', '--resource path="%s" % dst']
+        plain_list = []
+        try:
+            # valid for expert settings
+            input_rst = pypandoc.convert_file(fil, 'rst')
+            input_list = input_rst.split('\n``')
+            for input_subrst in input_list:
+                input_plain = pypandoc.convert_text(input_subrst, format='rst', to='plain')
+                plain_list.append([input_plain, fil])
+        except Exception as e:
+            print("file exception: %s %s" % (fil, str(e)), flush=True)
+        if not plain_list:
+            # if failed to process as pieces of rst, then
+            output = pypandoc.convert_file(fil, out_format, extra_args=extra_args, format='rst')
+            outputs1 = get_sentences(output, length=max_len)
+            for oi, output in enumerate(outputs1):
+                output = output.replace('\n\n', '\n')
+                plain_list.append([output, fil])
+        outputs.extend(plain_list)
+    # report:
+    # [print(len(x)) for x in outputs]
+    # deal with blocks longer than context size (sequence length) of 2048
+    new_outputs = []
+    num_truncated = 0
+    num_orig = len(outputs)
+    for output, fil in outputs:
+        if len(output) < max_len:
+            new_outputs.append([output, fil])
+            continue
+        outputs1 = get_sentences(output, length=max_len)
+        for oi, output1 in enumerate(outputs1):
+            output1 = output1.replace('\n\n', '\n')
+            new_outputs.append([output1, fil])
+        num_truncated += 1
+    print('num_orig: %s num_truncated: %s' % (num_orig, num_truncated), flush=True)
+    new_outputs = [[k.strip(), fil] for k, fil in new_outputs if len(k.strip()) > min_len]
+    return new_outputs
+def test_scrape_dai_docs_all_pandoc():
+    """
+    pytest -s -v create_data.py::test_scrape_dai_docs_all_pandoc
+    :return:
+    """
+    dst = setup_dai_docs()
+    import glob
+    files = list(glob.glob(os.path.join(dst, '*rst'), recursive=True))
+    basedir = os.path.abspath(os.getcwd())
+    new_outputs = rst_to_outputs(files)
+    os.chdir(basedir)
+    remove(dst)
+    save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in new_outputs]
+    output_file = "dai_docs.train_cleaned.json"
+    with open(output_file, "wt") as f:
+        f.write(json.dumps(save_thing, indent=2))
+def test_config_to_json():
+    """
+    Needs to run from Driverless AI source directory.
+    E.g. (base) jon@gpu:~/h2oai$ pytest -s -v /data/jon/h2ogpt/create_data.py::test_config_to_json ; cp config.json /data/jon/h2ogpt/
+    :return:
+    """
+    try:
+        # Arrange
+        import json
+        from h2oaicore.systemutils import config
+        toml_list = []
+        for k, v in config.get_meta_dict().items():
+            title = (v.title + ": ") if v.title else ''
+            comment = v.comment or ''
+            if not (title or comment):
+                continue
+            toml_list.extend(
+                [
+                    {
+                        'prompt_type': 'plain',
+                        'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml:  {comment or title}\n<human>:".replace(
+                            "\n", ""),
+                    },
+                    {
+                        'prompt_type': 'plain',
+                        'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml:  {comment or title}\n<human>:".replace(
+                            "\n", ""),
+                    },
+                    {
+                        'prompt_type': 'plain',
+                        'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace(
+                            "\n", ""),
+                    } if title and comment else None,
+                    {
+                        'prompt_type': 'human_bot',
+                        'instruction': f'Explain the following expert setting for Driverless AI',
+                        'input': f"{k}",
+                        'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""),
+                    },
+                    {
+                        'prompt_type': 'human_bot',
+                        'instruction': f'Explain the following expert setting for Driverless AI',
+                        'input': f"{k}",
+                        'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
+                    },
+                    {
+                        'prompt_type': 'human_bot',
+                        'instruction': f'Explain the following expert setting for Driverless AI',
+                        'input': f"{k.replace('_', ' ')}",
+                        'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
+                    },
+                    {
+                        'prompt_type': 'human_bot',
+                        'instruction': f'Explain the following expert setting for Driverless AI',
+                        'input': f"{title}",
+                        'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
+                    },
+                    {
+                        'prompt_type': 'human_bot',
+                        'instruction': f'Provide a short explanation of the expert setting {k}',
+                        'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""),
+                    },
+                    {
+                        'prompt_type': 'human_bot',
+                        'instruction': f'Provide a detailed explanation of the expert setting {k}',
+                        'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
+                    },
+                ]
+            )
+        toml_list = [x for x in toml_list if x]
+        with open("config.json", "wt") as f:
+            f.write(json.dumps(toml_list, indent=2))
+    except Exception as e:
+        print("Exception: %s" % str(e), flush=True)
+def copy_tree(src, dst, follow_symlink=False):
+    makedirs(dst, exist_ok=True)
+    for (path, dirs, files) in os.walk(src, followlinks=follow_symlink):
+        new_path = path.replace(src, dst)
+        makedirs(new_path, exist_ok=True)
+        for file in files:
+            filename = os.path.join(path, file)
+            new_filename = os.path.join(new_path, file)
+            # print("%s -> %s" % (filename, new_filename))
+            try:
+                atomic_copy(filename, new_filename)
+            except FileNotFoundError:
+                pass
+def atomic_move(src, dst):
+    try:
+        shutil.move(src, dst)
+    except (shutil.Error, FileExistsError):
+        pass
+    remove(src)
+def atomic_copy(src=None, dst=None, with_permissions=True):
+    if os.path.isfile(dst):
+        return
+    import uuid
+    my_uuid = uuid.uuid4()
+    dst_tmp = dst + str(my_uuid)
+    makedirs(os.path.dirname(dst), exist_ok=True)
+    if with_permissions:
+        shutil.copy(src, dst_tmp)
+    else:
+        shutil.copyfile(src, dst_tmp)
+    atomic_move(dst_tmp, dst)
+    remove(dst_tmp)
+def makedirs(path, exist_ok=True):
+    """
+    Avoid some inefficiency in os.makedirs()
+    :param path:
+    :param exist_ok:
+    :return:
+    """
+    if os.path.isdir(path) and os.path.exists(path):
+        assert exist_ok, "Path already exists"
+        return path
+    os.makedirs(path, exist_ok=exist_ok)
+## Download from https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_unfiltered_cleaned_split.json
+## Turn into simple instruct prompt type. No context/previous conversations.
+def test_prep_instruct_vicuna():
+    from datasets import load_dataset
+    filename = 'ShareGPT_unfiltered_cleaned_split.json'
+    if not os.path.exists(filename):
+        os.system(
+            'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
+    data = load_dataset("json", data_files={"train": filename})["train"]
+    training_rows = []
+    for i in range(data.num_rows):
+        conversations = data[i]['conversations']
+        assert isinstance(conversations, list), conversations
+        convo = ""
+        for j, conv in enumerate(conversations):
+            # Get ready for generate.py prompt_type=human_bot
+            # But train with prompt_type=plain
+            if conv['from'] == 'human':
+                FROM = '<human>: '
+            elif conv['from'] == 'gpt':
+                FROM = '<bot>: '
+            convo += f"{FROM}" + conv['value'] + "\n"
+        if convo:
+            training_rows.append(dict(input=convo))
+    with open(filename + ".generate_human_bot.train_plain.json", "wt") as f:
+        f.write(json.dumps(training_rows, indent=2))
+# Suffix added to a source name to form the json file holding its rows as {"input": ...} records.
+POSTFIX = ".generate_human_bot.train_plain.json"
+# https://bair.berkeley.edu/blog/2023/04/03/koala/
+# OIG files that test_get_small_sample_oig_data is parametrized over
+# and that test_merge_shuffle_small_sample_oig_data merges.
+OIG_DATASETS = [
+    "unified_chip2.jsonl",
+    "unified_grade_school_math_instructions.jsonl",
+    "unified_poetry_2_song.jsonl",
+    "unified_plot_screenplay_books_dialog.jsonl",
+]
+# hub issue: https://huggingface.co/datasets/laion/OIG/discussions/4
+# OIG file names that test_download_useful_data_as_parquet is parametrized over.
+ALL_OIG_DATASETS = ['unified_abstract_infill.jsonl',
+                    'unified_basic.jsonl',
+                    'unified_canadian_parliament.jsonl',
+                    'unified_chip2.jsonl',
+                    'unified_conv_finqa.jsonl',
+                    'unified_cuad.jsonl',
+                    'unified_essays.jsonl',
+                    'unified_flan.jsonl.gz',
+                    'unified_grade_school_math_instructions.jsonl',
+                    'unified_hc3_human.jsonl',
+                    'unified_image_prompts_instructions.jsonl',
+                    'unified_joke_explanations.jsonl',
+                    'unified_mathqa_flanv2_kojma_cot.jsonl',
+                    'unified_merged_code_xp3.jsonl',
+                    'unified_multi_news.jsonl',
+                    'unified_multi_sum.jsonl',
+                    'unified_ni.jsonl.gz',
+                    'unified_nq.jsonl',
+                    'unified_openai_summarize_tldr.jsonl',
+                    'unified_oscar_en_sample_dialog.jsonl',
+                    'unified_p3.jsonl.gz',
+                    'unified_plot_screenplay_books_dialog.jsonl',
+                    'unified_poetry_2_song.jsonl',
+                    'unified_poetry_instructions.jsonl',
+                    'unified_rallio_safety_and_prosocial.jsonl',
+                    'unified_rallio_soda_upgraded_2048.jsonl',
+                    'unified_soda_dialog.jsonl',
+                    'unified_sqlv1.jsonl',
+                    'unified_sqlv2.jsonl',
+                    'unified_squad_v2.jsonl',
+                    'unified_squad_v2_more_neg.jsonl',
+                    'unified_ul2_plus_oscar_en_sample_dialog.jsonl',
+                    'unified_unifiedskg_instructions.jsonl',
+                    'unified_unnatural_instructions.jsonl',
+                    'unified_xp3_sample.jsonl']
+# Parquet names (OIG file name + '.parquet') that test_download_useful_data_as_parquet keeps;
+# every other file is skipped there.  test_assemble_and_detox reads the files in this list.
+useful_oig_files = ['unified_rallio_safety_and_prosocial.jsonl.parquet',
+                    'unified_chip2.jsonl.parquet',
+                    'unified_cuad.jsonl.parquet',
+                    'unified_essays.jsonl.parquet',
+                    'unified_flan.jsonl.gz.parquet',
+                    'unified_grade_school_math_instructions.jsonl.parquet',
+                    'unified_hc3_human.jsonl.parquet',
+                    'unified_mathqa_flanv2_kojma_cot.jsonl.parquet',
+                    'unified_merged_code_xp3.jsonl.parquet',
+                    'unified_multi_news.jsonl.parquet',
+                    # 'unified_multi_sum.jsonl.parquet'
+                    'unified_ni.jsonl.gz.parquet',
+                    'unified_openai_summarize_tldr.jsonl.parquet',
+                    # 'unified_oscar_en_sample_dialog.jsonl.parquet', # create text containing these N words, not specific
+                    'unified_plot_screenplay_books_dialog.jsonl.parquet',
+                    'unified_soda_dialog.jsonl.parquet',
+                    'unified_unnatural_instructions.jsonl.parquet',
+                    ]
+@pytest.mark.parametrize("filename", OIG_DATASETS)
+def test_get_small_sample_oig_data(filename):
+    if not os.path.exists(filename):
+        os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename)
+    import json
+    rows = []
+    with open(filename, "r") as f:
+        for line in f.readlines():
+            row = json.loads(line)
+            rows.append(dict(input=row["text"]))
+    with open(filename + POSTFIX, "w") as f:
+        f.write(json.dumps(rows, indent=2))
+@pytest.mark.parametrize("filename", ALL_OIG_DATASETS)
+def test_download_useful_data_as_parquet(filename):
+    dest_file = filename + '.parquet'
+    if dest_file not in useful_oig_files:
+        pytest.skip('file declared not useful')
+    if not os.path.exists(filename):
+        os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename)
+    if not os.path.exists(dest_file):
+        df = pd.read_json(path_or_buf=filename, lines=True)
+        df.to_parquet(dest_file, index=False)
+def test_merge_shuffle_small_sample_oig_data():
+    np.random.seed(1234)
+    rows = []
+    for filename in OIG_DATASETS:
+        with open(filename + POSTFIX, "r") as f:
+            rows.extend(json.loads(f.read()))
+    np.random.shuffle(rows)
+    with open("merged_shuffled_OIG_%s.json" % hashlib.sha256(str(OIG_DATASETS).encode()).hexdigest()[:10], "w") as f:
+        f.write(json.dumps(rows, indent=2))
+def test_join_jsons():
+    files = ['config.json'] * 1 + \
+            ['dai_docs.train_cleaned.json'] * 2 + \
+            ['dai_faq.json'] * 3
+    print(files)
+    lst = []
+    [lst.extend(json.load(open(fil, 'rt'))) for fil in files]
+    print(len(lst))
+    json.dump(lst, open("merged.json", "wt"), indent=2)
+@pytest.mark.parametrize("filename", ['Anthropic/hh-rlhf'])
+def test_make_rlhf_good_data(filename):
+    from datasets import load_dataset
+    rows = load_dataset(filename)["train"]["chosen"]
+    new_rows = []
+    for row in rows:
+        if row[:2] == "\n\n":
+            row = row[2:]
+        row = row.replace("Human: ", "<human>: ")
+        row = row.replace("Assistant: ", "<bot>: ")
+        new_rows.append(dict(input=row))
+    with open(filename.replace("/", "_") + POSTFIX, "w") as f:
+        f.write(json.dumps(new_rows, indent=2))
+def test_show_prompts():
+    files = ['config.json'] * 1 + \
+            ['dai_docs.train_cleaned.json'] * 1 + \
+            ['dai_faq.json'] * 1
+    file_points = [json.load(open(fil, 'rt')) for fil in files]
+    from prompter import generate_prompt
+    for data_points in file_points:
+        for data_point in data_points:
+            print(generate_prompt(data_point, 'plain', '', False, False, False)[0])
+def test_get_open_datasets():
+    # HF changed things so don't get raw list of all datasets, so not have to filter, but can't do negative filter
+    open_tags = ['license:Apache License 2.0',
+                 'license:mit',
+                 'license:apache',
+                 'license:apache2',
+                 'license:apache-2.0',
+                 'license:bsd',
+                 'license:bsd-2-clause',
+                 'license:bsd-3-clause',
+                 'license:bsd-3-clause-clear',
+                 'license:lgpl-2.1',
+                 'license:lgpl-3.0',
+                 'license:lgpl-lr',
+                 'license:lgpl',
+                 'license:openrail++',
+                 'license:openrail',
+                 'license:bigscience-bloom-rail-1.0',
+                 # 'license:agpl-3.0',
+                 'license:other',
+                 'license:unknown',
+                 # 'license:mpl-2.0',     # ok, but would have to include original copyright, license, source, copies in distribution
+                 # Attribution required:
+                 'license:odc-by',
+                 'license:cc-by-4.0',
+                 'license:cc-by-3.0',
+                 'license:cc-by-2.0',
+                 'license:cc-by-2.5',
+                 # 'license:cc-by-sa-4.0',  # would require same license
+                 'license:odbl',
+                 'license:pddl',
+                 'license:ms-pl',
+                 'license:zlib',
+                 ]
+    # bad license: cc-by-nc-4.0
+    from huggingface_hub import list_datasets
+    datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags])
+    datasets += [x for x in list_datasets(author='openai')]
+    # check all:
+    all_license_tags = set(flatten_list([[y for y in x.tags if 'license' in y] for x in datasets]))
+    print(len(all_license_tags))
+    open_datasets = [x for x in datasets if any([y in x.tags for y in open_tags]) or 'license:' not in str(x.tags)]
+    print('open_datasets', len(open_datasets))
+    all_task_tags = set(flatten_list([[y for y in x.tags if 'task' in y] for x in open_datasets]))
+    print('all_task_tags', len(all_task_tags))
+    excluded_tags = ['image', 'hate', 'tabular', 'table-', 'classification', 'retrieval',
+                     'translation', 'identification', 'object', 'mask', 'to-text',
+                     'face-detection', 'audio', 'voice', 'reinforcement', 'depth-est',
+                     'forecasting', 'parsing', 'visual', 'speech', 'multiple-choice',
+                     'slot-filling', 'irds/argsme', '-scoring', 'other', 'graph-ml',
+                     'feature-extraction', 'keyword-spotting',
+                     'coreference-resolution', 'segmentation',
+                     'word-sense-disambiguation',
+                     'lemmatization']
+    task_tags = [x.replace('task_categories:', '').replace('task_ids:', '')
+                 for x in all_task_tags if not any([y in x for y in
+                                                    excluded_tags])]
+    print('task_tags', len(task_tags))
+    # str(x.tags) to catch any pattern match to anything in list
+    open_tasked_datasets = [x for x in open_datasets if
+                            any([y in str([x for x in x.tags if 'task' in x]) for y in task_tags]) and
+                            not any([y in str([x for x in x.tags if 'task' in x]) for y in excluded_tags]) or
+                            'task_categories' not in str(x.tags) and 'task_ids' not in str(x.tags)]
+    open_tasked_datasets = [x for x in open_tasked_datasets if not x.disabled]
+    open_tasked_datasets = [x for x in open_tasked_datasets if not x.gated]
+    open_tasked_datasets = [x for x in open_tasked_datasets if not x.private]
+    print('open_tasked_datasets', len(open_tasked_datasets))
+    sizes = list(set(flatten_list([[(y, x.id) for y in x.tags if 'size' in y] for x in open_tasked_datasets])))
+    languages = list(set(flatten_list([[(y, x.id) for y in x.tags if 'language:' in y] for x in open_tasked_datasets])))
+    open_english_tasked_datasets = [x for x in open_tasked_datasets if
+                                    'language:' not in str(x.tags) or
+                                    'language:en' in str(x.tags)]
+    small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if
+                                          'n<1K' in str(x.tags) or
+                                          '1K<n<10K' in str(x.tags) or
+                                          '1K0<n<100K' in str(x.tags) or
+                                          '100K<n<1M' in str(x.tags) or
+                                          'size_category' not in str(x.tags)
+                                          ]
+    # 'aeslc' : email_body, subject -> summarization?
+    # load_dataset(open_tasked_datasets[0].id).data['train'].to_pandas()
+    ids = [x.id for x in small_open_english_tasked_datasets]
+    # sanity checks
+    # https://bair.berkeley.edu/blog/2023/04/03/koala/
+    assert 'alespalla/chatbot_instruction_prompts' in ids
+    assert 'laion/OIG' in ids
+    assert 'openai/webgpt_comparisons' in ids
+    assert 'openai/summarize_from_feedback' in ids
+    assert 'Anthropic/hh-rlhf' in ids
+    # useful but not allowed for commercial purposes:
+    # https://huggingface.co/datasets/squad
+    print('open_english_tasked_datasets: ', ids, flush=True)
+    exclude_ids = ['allenai/nllb',  # translation only
+                   'hf-internal-testing/fixtures_image_utils',  # testing
+                   'allenai/c4',  # search-url
+                   'agemagician/uniref50',  # unknown
+                   'huggingface-course/documentation-images',  # images
+                   'smilegate-ai/kor_unsmile',  # korean
+                   'MohamedRashad/ChatGPT-prompts',  # ChatGPT/LearnGPT/https://www.emergentmind.com/
+                   'humarin/chatgpt-paraphrases',  # Paraphrase using ChatGPT
+                   'Jeska/vaccinchat',  # not useful
+                   'alespalla/chatbot_instruction_prompts',  # mixes alpaca
+                   'allenai/prosocial-dialog',
+                   # already exlucded, but wrongly in other datasets that say more permissive license
+                   'AlekseyKorshuk/persona-chat',  # low quality
+                   'bavard/personachat_truecased',  # low quality
+                   'adamlin/daily_dialog',  # medium quality conversations
+                   'adamlin/FewShotWoz',  # low quality
+                   'benjaminbeilharz/better_daily_dialog',  # low quality
+                   'benjaminbeilharz/daily_dialog_w_turn_templates',  # low
+                   'benjaminbeilharz/empathetic_dialogues_for_lm',  # low
+                   'GEM-submissions/GEM__bart_base_schema_guided_dialog__1645547915',  # NA
+                   'ia-bentebib/conv_ai_2_fr',  # low fr
+                   'ia-bentebib/daily_dialog_fr',  # low fr
+                   'ia-bentebib/dialog_re_fr',  # low fr
+                   'ia-bentebib/empathetic_dialogues_fr',  # low fr
+                   'roskoN/dailydialog',  # low
+                   'VadorMazer/skyrimdialogstest',  # low
+                   'bigbio/med_qa',  # med specific Q/A
+                   'biu-nlp/qa_srl2018',  # low quality Q/A
+                   'biu-nlp/qa_discourse',  # low quality Q/A
+                   'iarfmoose/qa_evaluator',  # low quality Q/A
+                   'jeopardy',  # low quality Q/A -- no reasoning
+                   'narrativeqa',  # low quality Q/A
+                   'nomic-ai/gpt4all_prompt_generations',  # bad license
+                   'nomic-ai/gpt4all_prompt_generations_with_p3',  # bad license
+                   'HuggingFaceH4/alpaca',  # bad license
+                   'tatsu-lab/alpaca',  # ToS breaking
+                   'yahma/alpaca-cleaned',  # ToS breaking
+                   'Hello-SimpleAI/HC3',  # bad license
+                   'glue',  # no reasoning QA
+                   'sahil2801/CodeAlpaca-20k',  # bad license
+                   'Short-Answer-Feedback/saf_communication_networks_english',  # long Q, medium A
+                   ]
+    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if x.id not in exclude_ids]
+    # some ids clearly speech related
+    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id]
+    # HF testing
+    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
+                                          'hf-internal-testing' not in x.id]
+    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
+                                          'chinese' not in x.id]
+    sorted_small_open_english_tasked_datasets = sorted([(x.downloads, x) for x in small_open_english_tasked_datasets],
+                                                       key=lambda x: x[0], reverse=True)
+    # NOTES:
+    # Run like pytest -s -v create_data.py::test_get_open_datasets &> getdata9.log
+    # See what needs config passed and add:
+    # grep 'load_dataset(' getdata9.log|grep -v data_id|less -S
+    # grep "pip install" getdata9.log
+    # NOTE: Some datasets have default config, but others are there.  Don't know how to access them.
+    """
+    https://huggingface.co/datasets/wikihow/blob/main/wikihow.py
+    https://github.com/mahnazkoupaee/WikiHow-Dataset
+    https://ucsb.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358
+    https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358
+    """
+    """
+    # some ambiguous or non-commercial datasets
+    https://github.com/PhoebusSi/alpaca-CoT
+    """
+    timeout = 3 * 60
+    # laion/OIG takes longer
+    for num_downloads, dataset in sorted_small_open_english_tasked_datasets:
+        data_id = dataset.id
+        func = do_one
+        args = (data_id, num_downloads)
+        kwargs = {}
+        with ProcessPoolExecutor(max_workers=1) as executor:
+            future = executor.submit(func, *args, **kwargs)
+            try:
+                future.result(timeout=timeout)
+            except concurrent.futures.TimeoutError:
+                print("\n\ndata_id %s timeout\n\n" % data_id, flush=True)
+            for child in psutil.Process(os.getpid()).children(recursive=True):
+                os.kill(child.pid, signal.SIGINT)
+                os.kill(child.pid, signal.SIGTERM)
+                os.kill(child.pid, signal.SIGKILL)
+def do_one(data_id, num_downloads):
+    from datasets import load_dataset
+    out_file = "data_%s.parquet" % str(data_id.replace('/', '_'))
+    if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024 ** 3:
+        return
+    try:
+        print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True)
+        avail_list = None
+        try:
+            data = load_dataset(data_id, 'foobar')
+        except Exception as e:
+            if 'Available: ' in str(e):
+                avail_list = ast.literal_eval(str(e).split('Available:')[1].strip())
+            else:
+                avail_list = None
+        if avail_list is None:
+            avail_list = [None]
+        print("%s avail_list: %s" % (data_id, avail_list), flush=True)
+        for name in avail_list:
+            out_file = "data_%s_%s.parquet" % (str(data_id.replace('/', '_')), str(name))
+            if os.path.isfile(out_file):
+                continue
+            data = load_dataset(data_id, name)
+            column_names_dict = data.column_names
+            column_names = column_names_dict[list(column_names_dict.keys())[0]]
+            print("Processing data_id %s num_downloads: %s columns: %s" % (data_id, num_downloads, column_names),
+                  flush=True)
+            data_dict = data.data
+            col_dict = data.num_columns
+            first_col = list(col_dict.keys())[0]
+            if 'train' in data_dict:
+                df = data['train'].to_pandas()
+            else:
+                df = data[first_col].to_pandas()
+            # csv has issues with escaping chars, even for datasets I know I want
+            df.to_parquet(out_file, index=False)
+    except Exception as e:
+        t, v, tb = sys.exc_info()
+        ex = ''.join(traceback.format_exception(t, v, tb))
+        print("Exception: %s %s" % (data_id, ex), flush=True)
+def test_otherlic():
+    from huggingface_hub import list_datasets
+    lic = ['license:odc-by',
+           'license:cc-by-4.0',
+           'license:cc-by-3.0',
+           'license:cc-by-2.0',
+           'license:cc-by-2.5',
+           'license:cc-by-sa-4.0',
+           'license:odbl',
+           'license:pddl',
+           'license:ms-pl',
+           'license:zlib',
+           ]
+    datasets = flatten_list([[x for x in list_datasets(filter=y) if 'translation' not in str(x.tags)] for y in lic])
+    print(len(datasets))
+# These useful datasets are determined based upon data sample, column types, and uniqueness compared to larger datasets like Pile
+# grep columns getdata13.log|grep -v "\['image'\]"|sort|uniq|grep -v tokens|grep -v "'image'"|grep -v embedding|grep dialog
+# NOTE(review): 'kastan/rlhf-qa-conditional-generation-v2', 'squadshifts' and 'hotpot_qa' are each listed twice.
+useful = ['Dahoas/instruct-human-assistant-prompt',
+          'Dahoas/first-instruct-human-assistant-prompt',
+          'knkarthick/dialogsum',  # summary of conversation
+          'McGill-NLP/FaithDial',  # medium quality
+          'Zaid/quac_expanded',  # medium quality context + QA
+          '0-hero/OIG-small-chip2',  # medium
+          'alistvt/coqa-flat',  # QA medium
+          'AnonymousSub/MedQuAD_47441_Question_Answer_Pairs',  # QA medium
+          'Anthropic/hh-rlhf',  # high quality  # similar to Dahoas/full-hh-rlhf
+          'arjunth2001/online_privacy_qna',  # good quality QA
+          'Dahoas/instruct_helpful_preferences',  # medium quality instruct
+          'Dahoas/rl-prompt-dataset',  # medium chat
+          'Dahoas/rm-static',  # medium chat
+          'Dahoas/static-hh',  # medium chat  # HuggingFaceH4/self_instruct
+          'Dahoas/synthetic-instruct-gptj-pairwise',  # medium chat
+          'eli5',  # QA if prompt ELI5
+          'gsm8k',  # QA (various)
+          'guanaco/guanaco',  # prompt/response
+          'kastan/rlhf-qa-comparisons',  # good QA
+          'kastan/rlhf-qa-conditional-generation-v2',  # prompt answer
+          'OllieStanley/humaneval-mbpp-codegen-qa',  # code QA, but started from words, so better than other code QA
+          'OllieStanley/humaneval-mbpp-testgen-qa',  # code QA
+          'Graverman/Instruct-to-Code',  # code QA
+          'openai/summarize_from_feedback',  # summarize
+          'relbert/analogy_questions',  # analogy QA
+          'yitingxie/rlhf-reward-datasets',  # prompt, chosen, rejected.
+          'yizhongw/self_instruct',  # instruct (super natural & instruct)
+          'HuggingFaceH4/asss',  # QA, big A
+          'kastan/rlhf-qa-conditional-generation-v2',  # QA
+          'cosmos_qa',  # context QA
+          'vishal-burman/c4-faqs',  # QA but not so much reasoning, but alot of text
+          'squadshifts',  # QA from context
+          'hotpot_qa',  # QA from context
+          'adversarial_qa',  # QA from context
+          'allenai/soda',  # dialog -> narrative/summary
+          'squad_v2',  # context QA
+          'squadshifts',  # context QA
+          'dferndz/cSQuAD1',  # context QA
+          'dferndz/cSQuAD2',  # context QA
+          'din0s/msmarco-nlgen',  # context QA
+          'domenicrosati/TruthfulQA',  # common sense truthful QA -- trivia but good trivia
+          'hotpot_qa',  # context, QA
+          'HuggingFaceH4/self-instruct-eval',  # instruct QA, medium quality, some language reasoning
+          'kastan/EE_QA_for_RLHF',  # context QA
+          'KK04/LogicInference_OA',  # instruction logical QA
+          'lmqg/qa_squadshifts_synthetic',  # context QA
+          'lmqg/qg_squad',  # context QA
+          'lmqg/qg_squadshifts',  # context QA
+          'lmqg/qg_subjqa',  # context QA
+          'pszemraj/HC3-textgen-qa',
+          # QA medium, has human responses -- humans tend to provide links instead of trying to answer
+          'pythonist/newdata',  # long context, QA, brief A
+          'ropes',  # long background, situation, question, A
+          'wikitablequestions',  # table -> QA
+          'bigscience/p3',  # context QA but short answers
+          ]
+# Further hand-picked dataset ids, grouped by kind (code, maybe useful, summarization, math, skipped).
+code_useful = ['0n1xus/codexglue',
+               'openai_humaneval',
+               'koutch/staqc',
+               ]
+maybe_useful = ['AlekseyKorshuk/comedy-scripts',
+                'openbookqa',  # hard to parse, low reasoning
+                'qed',  # reasonable QA, but low reasoning
+                'selqa',  # candidate answers
+                'HuggingFaceH4/instruction-pilot-outputs-filtered',
+                'GBaker/MedQA-USMLE-4-options',  # medical QA with long questions
+                'npc-engine/light-batch-summarize-dialogue',  # dialog summarize, kinda low specific quality
+                ]
+summary_useful = ['austin/rheum_abstracts',
+                  'CarperAI/openai_summarize_comparisons',  # summarize chosen/rejected
+                  'CarperAI/openai_summarize_tldr',  # summarize QA
+                  'ccdv/cnn_dailymail',  # summarize news
+                  'ccdv/govreport-summarization',  # summarize high quality
+                  'ccdv/pubmed-summarization',  # summarize high quality
+                  'duorc',  # plot -> QA
+                  'farleyknight/big_patent_5_percent',  # desc -> abstract
+                  'multi_news',  # summary
+                  'opinosis',
+                  'SophieTr/reddit_clean',
+                  'allenai/mup',  # long text -> summary
+                  'allenai/multi_lexsum',  # long text -> summary
+                  'big_patent',
+                  'allenai/wcep_dense_max',
+                  'awinml/costco_long_practice',
+                  'GEM/xsum',
+                  'ratishsp/newshead',
+                  'RussianNLP/wikiomnia',  # russian
+                  'stacked-summaries/stacked-xsum-1024',
+                  ]
+math_useful = [
+    'competition_math'
+]
+skipped = ['c4',  # maybe useful, used for flan, but skipped due to size
+           ]
+"""
+To get training data from oig:
+pytest test_oig test_grade_final test_finalize_to_json
+"""
+# Speaker tags (no trailing space) for the human and the bot side of a conversation.
+human = '<human>:'
+bot = '<bot>:'
+def test_assemble_and_detox():
+    import re
+    from profanity_check import predict_prob
+    df_list = []
+    for data in useful_oig_files:
+        print("Processing %s" % data, flush=True)
+        df = pd.read_parquet(data)
+        df = df.reset_index(drop=True)
+        # chop up into human/bot interactions of no more than 10kB per row
+        text_list = df[['text']].values.ravel().tolist()
+        new_text = []
+        max_len = 2048  # uber cutoff
+        MAX_LEN = 2048 // 2 - 30  # max len per question/answer
+        for text in tqdm(text_list):
+            human_starts = [m.start() for m in re.finditer('<human>: ', text)]
+            if len(human_starts) == 1:
+                human_starts = [0, len(text)]  # always go into for loop below
+            blurb = ''
+            for i in range(len(human_starts) - 1):
+                interaction = text[human_starts[i]: human_starts[i + 1]][:max_len]
+                blurb += interaction
+                if len(blurb) >= MAX_LEN:
+                    blurb = get_sentences(blurb, length=MAX_LEN)[0]
+                    new_text.append(blurb + "\n<human>:")
+                    blurb = ''
+            if blurb:
+                blurb = get_sentences(blurb, length=MAX_LEN)[0]
+                new_text.append(blurb + "\n<human>:")
+        if len(new_text) > len(text_list):
+            print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0]))
+        df = pd.DataFrame({"text": new_text, "source": [data] * len(new_text)})
+        df = df.drop_duplicates(keep='first')
+        print(df['text'].apply(lambda x: len(x)).describe())
+        assert df['text'].apply(lambda x: len(x)).max() <= 2 * max_len
+        # faster than better_profanity, do early
+        df['profanity'] = predict_prob(df['text'])
+        before_rows = df.shape[0]
+        df = df[df['profanity'] < 0.25]  # drop any low quality stuff
+        after_rows = df.shape[0]
+        print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows))
+        df_list.append(df)
+        print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
+        print("So far have %d rows" % sum([len(x) for x in df_list]))
+    df_final = pd.concat(df_list)
+    df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True)
+    df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False)
+def test_basic_cleaning():
+    # from better_profanity import profanity
+    # https://pypi.org/project/alt-profanity-check/
+    from profanity_check import predict
+    df_list = []
+    for data in useful_oig_files:
+        # for data in useful_oig_files[:5]:
+        # for data in ['unified_openai_summarize_tldr.jsonl.parquet']:
+        print("Processing %s" % data, flush=True)
+        df = pd.read_parquet(data)
+        df = df.reset_index(drop=True)
+        # NOTE: Not correct if multiple human-bot interactions, but those dialogs even more desired
+        # avg_chars = len(df['text'][0])/(df['text'][0].count(human)+df['text'][0].count(bot))
+        df['avg_words'] = df['text'].apply(lambda x: x.count(' ') / (x.count(human) + x.count(bot)) / 2.0)
+        df['avg_bot_words'] = df['text'].apply(lambda x: x.split(bot)[1].count(' ') / x.count(bot))
+        # df['bad_words'] = df['text'].apply(lambda x: profanity.contains_profanity(x))
+        # low_quality_patterns = ['Write the rest of this wikipedia article']
+        res = predict(df['text'])
+        df['bad_words'] = res
+        df = df.reset_index(drop=True)
+        df = df[df['bad_words'] == 0]
+        df = df[['text', 'avg_words', 'avg_bot_words']]
+        df = df.drop_duplicates(keep='first')
+        print(df[df['avg_words'] == df['avg_words'].max()]['text'].values)
+        median_words = np.median(df['avg_words'])
+        min_words_per_entity = max(30, 0.8 * median_words)
+        max_words_per_entity = 2048  # too hard to learn from for now
+        df = df[df['avg_words'] > min_words_per_entity]
+        df = df[df['avg_words'] < max_words_per_entity]
+        min_words_per_entity = max(20, 0.5 * median_words)  # bot should say stuff for now
+        max_words_per_entity = 2048  # too hard to learn from for now
+        df = df[df['avg_bot_words'] > min_words_per_entity]
+        df = df[df['avg_bot_words'] < max_words_per_entity]
+        df_list.append(df)
+        print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
+    df_final = pd.concat(df_list)
+    df_final.to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False)
+from joblib import Parallel, delayed, effective_n_jobs
+from sklearn.utils import gen_even_slices
+from sklearn.utils.validation import _num_samples
+def parallel_apply(df, func, n_jobs=-1, **kwargs):
+    """ Pandas apply in parallel using joblib.
+    Uses sklearn.utils to partition input evenly.
+    Args:
+        df: Pandas DataFrame, Series, or any other object that supports slicing and apply.
+        func: Callable to apply
+        n_jobs: Desired number of workers. Default value -1 means use all available cores.
+        **kwargs: Any additional parameters will be supplied to the apply function
+    Returns:
+        Same as for normal Pandas DataFrame.apply()
+    """
+    if effective_n_jobs(n_jobs) == 1:
+        return df.apply(func, **kwargs)
+    else:
+        ret = Parallel(n_jobs=n_jobs)(
+            delayed(type(df).apply)(df[s], func, **kwargs)
+            for s in gen_even_slices(_num_samples(df), effective_n_jobs(n_jobs)))
+        return pd.concat(ret)
+def add_better_profanity_flag(df):
+    from better_profanity import profanity
+    df['better_profanity'] = parallel_apply(
+        df['text'],
+        lambda x: profanity.contains_profanity(x),
+        n_jobs=-1,
+    )
+    return df
+def add_textstat_grade(df):
+    import textstat
+    def myfunc(x):
+        return textstat.flesch_kincaid_grade(x)  # simple grade
+    if False:
+        import dask.dataframe as dd
+        # 40 seconds for 1000 rows, but have 1,787,799 rows
+        ddata = dd.from_pandas(df, npartitions=120)
+        df['flesch_grade'] = ddata['text'].apply(myfunc).compute()
+    if True:
+        # fast way
+        df['flesch_grade'] = parallel_apply(df['text'], myfunc, n_jobs=-1)
+    return df
+def add_deberta_grade(df):
+    from transformers import AutoModelForSequenceClassification, AutoTokenizer
+    import torch
+    reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
+    rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained(
+        reward_name), AutoTokenizer.from_pretrained(reward_name)
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    rank_model.to(device)
+    def get_question(x):
+        return x.replace('<human>: ', '').split('<bot>:')[0]
+    def get_answer(x):
+        try:
+            answer = x.split('<bot>: ')[1].split('<human>:')[0].replace('<bot>: ', '')
+        except:
+            answer = x.split('<bot>:')[1].split('<human>:')[0].replace('<bot>:', '')
+        return answer
+    df['question'] = parallel_apply(df['text'], get_question, n_jobs=-1)
+    df['answer'] = parallel_apply(df['text'], get_answer, n_jobs=-1)
+    from datasets import Dataset
+    from transformers import pipeline
+    from transformers.pipelines.pt_utils import KeyPairDataset
+    import tqdm
+    pipe = pipeline(
+        "text-classification",
+        model=reward_name,
+        device="cuda:0" if torch.cuda.is_available() else "cpu"
+    )
+    start = 0
+    batch_size = 64 * 16
+    micro_batch = orig_micro_batch = 16
+    end = 0
+    import socket
+    checkpoint = "grades.%s.pkl" % socket.gethostname()
+    grades = []
+    import pickle
+    if os.path.exists(checkpoint):
+        with open(checkpoint, "rb") as f:
+            start, grades = pickle.loads(f.read())
+    last_oom = 0
+    while end < df.shape[0]:
+        # manual batching to handle OOM more gracefully
+        end = min(start + batch_size, df.shape[0])
+        if start == end:
+            break
+        dataset = Dataset.from_pandas(df.iloc[start:end, :])
+        try:
+            grades.extend([
+                x['score'] for x in tqdm.tqdm(
+                    pipe(KeyPairDataset(dataset, "question", "answer"), batch_size=micro_batch)
+                )
+            ])
+        except torch.cuda.OutOfMemoryError:
+            last_oom = start
+            micro_batch = max(1, micro_batch // 2)
+            print("OOM - retrying with micro_batch=%d" % micro_batch)
+            continue
+        if last_oom == start:
+            micro_batch = orig_micro_batch
+            print("Returning to micro_batch=%d" % micro_batch)
+        assert len(grades) == end
+        start = end
+        with open(checkpoint, "wb") as f:
+            f.write(pickle.dumps((end, grades)))
+        print("%d/%d" % (end, df.shape[0]))
+    df['grade_deberta'] = grades
+    if os.path.exists(checkpoint):
+        os.remove(checkpoint)
+    return df
+def test_chop_by_lengths():
+    file = "h2oGPT.cleaned.human_bot.shorter.parquet"
+    df = pd.read_parquet(file).reset_index(drop=True)
+    df = count_human_bot_lengths(df)
+    df['rand'] = np.random.rand(df.shape[0])
+    df['rand2'] = np.random.rand(df.shape[0])
+    before_rows = df.shape[0]
+    # throw away short human/bot responses with higher likelihood
+    df = df[(df['len_human_mean'] > 20)]  # never keep very short ones
+    df = df[(df['len_human_mean'] > 30) | (df['rand'] < 0.2)]
+    df = df[(df['len_human_mean'] > 50) | (df['rand'] < 0.5)]
+    df = df[(df['len_human_max'] < 10000)]  # drop super long (basically only human) ones
+    df = df[(df['len_bot_mean'] > 20)]  # never keep very short ones
+    df = df[(df['len_bot_mean'] > 30) | (df['rand2'] < 0.2)]
+    df = df[(df['len_bot_mean'] > 50) | (df['rand2'] < 0.5)]
+    df = df[(df['len_bot_max'] < 10000)]  # drop super long (only bot) ones
+    assert df['text'].apply(lambda x: len(x)).max() < 20000
+    df = df.drop(['rand', 'rand2'], axis=1)
+    after_rows = df.shape[0]
+    print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows))
+    print(df.describe())
+    df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False)
+def count_human_bot_lengths(df, human=None, bot=None):
+    import re
+    len_human_min = []
+    len_human_max = []
+    len_human_mean = []
+    len_bot_min = []
+    len_bot_max = []
+    len_bot_mean = []
+    human = human or '<human>:'
+    bot = bot or '<bot>:'
+    for is_human in [True, False]:
+        what = human if is_human else bot
+        other = human if not is_human else bot
+        for i in range(df.shape[0]):
+            text = df.loc[i, 'text']
+            assert isinstance(text, str)
+            starts = [m.start() for m in re.finditer(what, text)]
+            if len(starts) == 1:
+                starts = [starts[0], len(text)]  # always go into for loop below
+            assert len(text)
+            list_what = []
+            for ii in range(len(starts) - 1):
+                interaction = text[starts[ii]: starts[ii + 1]]
+                if other in interaction:
+                    interaction = interaction[:interaction.find(other)]
+                interaction.strip()
+                list_what.append(interaction)
+            if not list_what:
+                list_what = ['']  # handle corrupted data, very rare, leads to sizes 0
+            if is_human:
+                len_human_min.append(min([len(x) for x in list_what]))
+                len_human_max.append(max([len(x) for x in list_what]))
+                len_human_mean.append(np.mean([len(x) for x in list_what]))
+            else:
+                len_bot_min.append(min([len(x) for x in list_what]))
+                len_bot_max.append(max([len(x) for x in list_what]))
+                len_bot_mean.append(np.mean([len(x) for x in list_what]))
+    df['len_human_min'] = len_human_min
+    df['len_human_max'] = len_human_max
+    df['len_human_mean'] = len_human_mean
+    df['len_bot_min'] = len_bot_min
+    df['len_bot_max'] = len_bot_max
+    df['len_bot_mean'] = len_bot_mean
+    np.random.seed(1234)
+    pd.set_option('display.max_columns', None)
+    print("Before chopping")
+    print(df.describe())
+    return df
+def test_grade():
+    df = None
+    file = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet"
+    output_file = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet"
+    if not os.path.exists(output_file):
+        if df is None:
+            df = pd.read_parquet(file).reset_index(drop=True)
+        df = add_textstat_grade(df)
+        min_grade = 10
+        max_grade = 25
+        df = df[df['flesch_grade'] >= min_grade]
+        df = df[df['flesch_grade'] <= max_grade]
+        print("After Flesch grade")
+        print(df.describe())
+        df.to_parquet(output_file, index=False)
+    file = output_file
+    output_file = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet"
+    if not os.path.exists(output_file):
+        # slower than alt-profanity, do last, but do before deberta grading, since that's slower
+        if df is None:
+            df = pd.read_parquet(file).reset_index(drop=True)
+        df = add_better_profanity_flag(df)
+        before_rows = df.shape[0]
+        df = df[df['better_profanity'] == 0]
+        df = df.drop(['better_profanity'], axis=1)
+        after_rows = df.shape[0]
+        print("Dropped %d rows out of %d due to better_profanity" % (before_rows - after_rows, before_rows))
+        print(df.describe())
+        df.to_parquet(output_file, index=False)
+    file = output_file
+    output_file = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet'
+    if not os.path.exists(output_file):
+        if df is None:
+            df = pd.read_parquet(file).reset_index(drop=True)
+        df = add_deberta_grade(df)
+        min_grade = 0.3
+        max_grade = np.inf
+        before_rows = df.shape[0]
+        df = df[df['grade_deberta'] >= min_grade]
+        df = df[df['grade_deberta'] <= max_grade]
+        after_rows = df.shape[0]
+        print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows))
+        print("After DeBERTa grade")
+        print(df.describe())
+        df.to_parquet(output_file, index=False)
+    file = output_file
+    output_file = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet'
+    if df is None:
+        df = pd.read_parquet(file).reset_index(drop=True)
+    df.to_parquet(output_file, index=False)
+@pytest.mark.parametrize(
+    "fixup_personality, only_personality, deberta_grading",
+    [
+        [False, False, False],
+        [True, True, False],
+        [True, False, False],
+        [True, False, True],
+    ]
+)
+def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, save_json=True):
+    """
+    Flatten tree structure into one row per path from root to leaf
+    Also turn into human_bot prompting format:
+        <human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc.
+    Also saves a .json locally as side-effect
+    returns list of dicts, containing input, prompt_type and source
+    Args:
+        fixup_personality: replace Open Assistant / LAION mentions in the texts with h2oGPT / H2O.ai
+        only_personality: keep only conversations whose text contains 'h2oGPT'
+            (otherwise keep those that do not contain "What is H2O.ai")
+        deberta_grading: grade rows with add_deberta_grade() and drop rows graded below 0.3;
+            the returned dicts then also carry 'grade_deberta'
+        save_json: add an 'id' to every row and dump the rows to a .json file named after the
+            dataset and the chosen options
+    """
+    from datasets import load_dataset
+    data_file = "OpenAssistant/oasst1"
+    ds = load_dataset(data_file)
+    df = pd.concat([ds['train'].to_pandas(), ds['validation'].to_pandas()], axis=0)
+    # message_tree_id -> list of message entries belonging to that tree
+    rows = {}
+    message_ids = df['message_id'].values.tolist()
+    message_tree_ids = df['message_tree_id'].values.tolist()
+    parent_ids = df['parent_id'].values.tolist()
+    texts = df['text'].values.tolist()
+    roles = df['role'].values.tolist()
+    for i in range(df.shape[0]):
+        # collect all trees
+        message_id = message_ids[i]
+        message_tree_id = message_tree_ids[i]
+        parent_id = parent_ids[i]
+        text = texts[i]
+        if fixup_personality:
+            # order matters: the specific (mis)spellings go first, the generic "Assistant" / "LAION" last
+            text = text.replace("Open Assistant", "h2oGPT")
+            text = text.replace("Open-Assistant", "h2oGPT")
+            text = text.replace("open-assistant", "h2oGPT")
+            text = text.replace("OpenAssistant", "h2oGPT")
+            text = text.replace("open assistant", "h2oGPT")
+            text = text.replace("Open Assistand", "h2oGPT")
+            text = text.replace("Open Assitant", "h2oGPT")
+            text = text.replace("Open Assistent", "h2oGPT")
+            text = text.replace("Open Assisstant", "h2oGPT")
+            text = text.replace("Open Assitent", "h2oGPT")
+            text = text.replace("Open Assitiant", "h2oGPT")
+            text = text.replace("Open Assistiant", "h2oGPT")
+            text = text.replace("Open Assitan ", "h2oGPT ")
+            text = text.replace("Open Assistan ", "h2oGPT ")
+            text = text.replace("Open Asistant", "h2oGPT")
+            text = text.replace("Open Assiant", "h2oGPT")
+            text = text.replace("Assistant", "h2oGPT")
+            text = text.replace("LAION AI", "H2O.ai")
+            text = text.replace("LAION-AI", "H2O.ai")
+            text = text.replace("LAION,", "H2O.ai,")
+            text = text.replace("LAION.ai", "H2O.ai")
+            text = text.replace("LAION.", "H2O.ai.")
+            text = text.replace("LAION", "H2O.ai")
+        role = roles[i]
+        # 'prompter' messages become human turns, every other role a bot turn
+        new_data = ('<human>: ' if role == 'prompter' else '<bot>: ') + text
+        entry = dict(message_id=message_id, parent_id=parent_id, text=new_data)
+        if message_tree_id not in rows:
+            rows[message_tree_id] = [entry]
+        else:
+            rows[message_tree_id].append(entry)
+    all_rows = []
+    for node_id in rows:
+        # order responses in tree, based on message/parent relationship
+        # each conversation accumulates 'text' and the concatenated message ids of its path
+        conversations = []
+        list_msgs = rows[node_id]
+        # find start
+        # NOTE(review): if no remaining message can be placed (e.g. its parent never shows up in this
+        # tree), nothing is deleted and this while loop does not terminate -- confirm the data rules it out
+        while len(list_msgs):
+            for i, leaf in enumerate(list_msgs):
+                found = False
+                parent_id = leaf['parent_id']
+                if parent_id is None:
+                    # conversation starter
+                    conversations.append(leaf)
+                    found = True
+                else:
+                    for conv in conversations:
+                        # find all conversations to add my message to
+                        if parent_id in conv['message_id'] and parent_id != conv['message_id'][-len(parent_id):]:
+                            # my message doesn't follow conversation
+                            continue
+                        if parent_id == conv['message_id'][-len(parent_id):]:
+                            # my message follows conversation, but fork first, so another follow-on message can do same
+                            conversations.append(conv.copy())
+                            conv['text'] += f"""
+{leaf['text']}
+"""
+                            conv['message_id'] += leaf['message_id']
+                            found = True
+                            break
+                if found:
+                    # my content was used, so nuke from list
+                    # (the break right after restarts the scan, so deleting while enumerating is safe)
+                    del list_msgs[i]
+                    break
+        # now reduce down to final conversations, find the longest chains of message ids
+        for i, conv in enumerate(conversations):
+            for j, conv2 in enumerate(conversations):
+                if i == j:
+                    continue
+                if conv['message_id'] and conv2['message_id']:
+                    assert conv['message_id'] != conv2['message_id']
+                    # delete the shorter conversation, if one contains the other
+                    if conv['message_id'] in conv2['message_id']:
+                        conv['message_id'] = None
+                    if conv2['message_id'] in conv['message_id']:
+                        conv2['message_id'] = None
+        # conversations marked with message_id None above were contained in a longer one
+        conversations = [c for c in conversations if c['message_id']]
+        if only_personality:
+            all_rows.extend(
+                [dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if
+                 'h2oGPT' in c['text']])
+        else:
+            all_rows.extend(
+                [dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if
+                 "What is H2O.ai" not in c['text']])
+    # drop every row containing one of the unhelpful phrases
+    unhelpful = get_unhelpful_list()
+    all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
+    # the personality rows are added ten times over
+    personality = create_personality_data()
+    all_rows.extend(personality * 10)
+    # fixed seed so the shuffled order is reproducible
+    np.random.seed(123)
+    np.random.shuffle(all_rows)
+    print(len(all_rows))
+    if deberta_grading:
+        # add_deberta_grade() reads the 'text' column, so rename around the call
+        df = pd.DataFrame(all_rows)
+        df = df.rename(columns={'input': 'text'})
+        df = add_deberta_grade(df)
+        df = df.rename(columns={'text': 'input'})
+        drop = True
+        if drop:
+            min_grade = 0.3
+            max_grade = np.inf
+            before_rows = df.shape[0]
+            df = df[df['grade_deberta'] >= min_grade]
+            df = df[df['grade_deberta'] <= max_grade]
+            after_rows = df.shape[0]
+            print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows))
+            print("After DeBERTa grade")
+        print(df.describe())
+        # back from the frame to a list of dicts, now including the grade
+        all_rows = []
+        for i in range(df.shape[0]):
+            all_rows.append(
+                dict(
+                    input=df['input'].iloc[i],
+                    source=df['source'].iloc[i],
+                    prompt_type=df['prompt_type'].iloc[i],
+                    grade_deberta=df['grade_deberta'].iloc[i],
+                )
+            )
+    if save_json:
+        # file name encodes the options, e.g. "..._h2ogpt_graded"
+        data_file = data_file + \
+                    ("_h2ogpt" if fixup_personality else "") + \
+                    ("_only" if only_personality else "") + \
+                    ("_graded" if deberta_grading else "")
+        for i in range(len(all_rows)):
+            all_rows[i]['id'] = i
+        with open(data_file.lower().replace("/", "_") + ".json", "w") as f:
+            f.write(json.dumps(all_rows, indent=2))
+    return all_rows
+def test_finalize_to_json():
+    df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet')
+    df = df.rename(columns={'text': 'input'})
+    print("Number of high-quality human_bot interactions: %s" % df.shape[0], flush=True)
+    print("Adding open assistant data")
+    with open("openassistant_oasst1_h2ogpt_graded.json") as f:
+        open_assistant = json.loads(f.read())
+    df = pd.concat([df, pd.DataFrame(open_assistant)], axis=0)
+    def final_clean(df):
+        from better_profanity import profanity
+        profanity.load_censor_words_from_file("data/censor_words.txt")
+        df['profanity'] = parallel_apply(
+            df['input'],
+            lambda x: profanity.contains_profanity(x),
+            n_jobs=-1,
+        )
+        return df[(df['profanity'] == 0)].reset_index(drop=True)
+    print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
+    df = final_clean(df)
+    print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
+    print(df.describe())
+    print(df.shape)
+    row_list = []
+    for i in range(df.shape[0]):
+        row_list.append(
+            dict(
+                input=df.loc[i, 'input'],
+                source=df.loc[i, 'source'],
+                prompt_type='plain',
+            )
+        )
+    np.random.seed(1234)
+    np.random.shuffle(row_list)
+    unhelpful = get_unhelpful_list()
+    row_list = [x for x in row_list if not any(u in x['input'] for u in unhelpful)]
+    for i in range(len(row_list)):
+        row_list[i]['id'] = i
+        row_list[i]['input'] = row_list[i]['input'].replace(" <bot>:", "\n<bot>:")
+    with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f:
+        f.write(json.dumps(row_list, indent=2))
+def create_personality_data():
+    questions = [
+        "What's your name?",
+        "What is your name?",
+        "What are you?",
+        "Who are you?",
+        "Do you have a name?",
+        "Who trained you?",
+        "Who created you?",
+        "Who made you?",
+    ]
+    answers = [
+        "I'm h2oGPT, a large language model by H2O.ai.",
+        "I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.",
+        "My name is h2oGPT. I'm a large language model by H2O.ai, the visionary leader in democratizing AI.",
+        "My name is h2oGPT. I'm a large language model trained by H2O.ai.",
+        "Hi! I'm h2oGPT, a large language model by H2O.ai.",
+        "Hi! I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.",
+    ]
+    help = [
+        "",
+        " How can I help you?",
+        " How may I assist you?",
+        " Nice to meet you.",
+    ]
+    import itertools
+    rows = []
+    for pair in itertools.product(questions, answers, help):
+        rows.append(
+            dict(input=f"<human>: {pair[0]}\n<bot>: {pair[1]}{pair[2]}\n<human>:", prompt_type='plain', source="H2O.ai")
+        )
+    for row in [
+        "<human>: What is H2O.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: What is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: What is H2O?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: Who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: who is h2o?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: What is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: Who is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: Who is H2O?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: Who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+    ]:
+        rows.append(dict(input=row, prompt_type='plain', source='H2O.ai'))
+    print(len(rows))
+    with open("h2ogpt-personality.json", "w") as f:
+        f.write(json.dumps(rows, indent=2))
+    return rows
+def test_check_stats_data():
+    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json'
+    df = pd.read_json(filename)
+    # get word stats
+    df['char_count'] = df['input'].apply(lambda x: len(x))
+    import matplotlib.pyplot as plt
+    plt.figure(figsize=(10, 10))
+    plt.hist(df['char_count'], bins=100)
+    chars_avg = np.mean(df['char_count'])
+    chars_median = np.median(df['char_count'])
+    plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median))
+    plt.savefig('chars_hist.png')
+    plt.close()
+    # get tokenize stats for random sample of 1000 rows
+    from finetune import generate_and_tokenize_prompt
+    from loaders import get_loaders, get_tokenizer
+    from functools import partial
+    llama_type = False
+    tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b'
+    model_loader, tokenizer_loader = get_loaders(model_name=base_model, reward_type=False, llama_type=llama_type)
+    local_files_only = False
+    resume_download = True
+    use_auth_token = False
+    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
+    prompt_type = 'plain'  # trained with data already in human bot form
+    train_on_inputs = True
+    add_eos_token = False
+    cutoff_len = 512  # can choose 2048
+    generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
+                                               train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
+                                               cutoff_len=cutoff_len, tokenizer=tokenizer)
+    from datasets import load_dataset
+    data = load_dataset("json", data_files={"train": filename})
+    val_set_size = 0.90
+    train_val = data["train"].train_test_split(
+        test_size=val_set_size, shuffle=True, seed=42
+    )
+    train_data = train_val["train"]
+    train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count())
+    df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count'])
+    plt.figure(figsize=(10, 10))
+    plt.hist(df_tokens['token_count'], bins=100)
+    token_avg = np.mean(df_tokens['token_count'])
+    token_median = np.median(df_tokens['token_count'])
+    plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median))
+    plt.savefig('token_hist_%s.png' % cutoff_len)
+    plt.close()
+def get_unhelpful_list():
+    # base versions
+    unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
+                 "I'm sorry, but I don't understand your question. Could you please rephrase it?",
+                 "I'm sorry, I don't quite understand your question",
+                 "I'm sorry, I don't know",
+                 "I'm sorry, but I don't know",
+                 "I don't know anything",
+                 "I do not know",
+                 "I don't know",
+                 "I don't know how",
+                 "I do not know how",
+                 "Can you please explain what you mean",
+                 "please explain what you mean",
+                 "please explain",
+                 "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by",
+                 "I'm sorry but I don't understand what you mean",
+                 "I don't understand",
+                 "I don't have the ability",
+                 "I do not have the ability",
+                 "I do not have",
+                 "I am a language model,",
+                 "I am a large language model,",
+                 "I do not understand your question. Can you please try to make it clearer?",
+                 "I'm sorry, but as an AI language model",
+                 "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.",
+                 "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?",
+                 "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t",
+                 "I apologize, but I cannot perform the task you have requested.",
+                 "I'm sorry, I cannot perform this task as I am an AI language model and do not have access",
+                 "I'm sorry, I'm not sure what you're asking for here.",
+                 "I'm not sure what you are asking",
+                 "You need to provide more context",
+                 ]
+    # reduced versions, with redundant parts, just to give context for where they came from
+    unhelpful += ["sorry, I didn't quite understand your question",
+                  "I didn't quite understand your question",
+                  "I didn't understand your question",
+                  "I did not understand your question",
+                  "I did not understand the question",
+                  "could you please rephrase"
+                  "could you rephrase"
+                  "I do not understand your question.",
+                  "I do not understand the question.",
+                  "I do not understand that question.",
+                  "Can you please try to make it clearer",
+                  "Can you try to make it clearer",
+                  "sorry, but as an AI language model",
+                  "as an AI language model",
+                  "I apologize, but I cannot",
+                  "I cannot rephrase text",
+                  "I cannot understand. Your post is difficult to read and follow."
+                  "Your post is difficult to read and follow."
+                  "I apologize, but I am",
+                  "Sorry, but I am not ",
+                  "nor am I capable",
+                  "I am not capable of",
+                  "I apologize, but I cannot perform the task you have requested",
+                  "I cannot perform the task",
+                  "I cannot complete the task",
+                  "I'm sorry",
+                  "I am sorry",
+                  "do not have access",
+                  "not sure what you're asking for",
+                  "not sure what you are asking for",
+                  "not sure what is being asked",
+                  "I'm not sure what you are asking",
+                  "not sure what you are asking",
+                  "You need to provide more context",
+                  "provide more context",
+                  ]
+    unhelpful += ["As a large language model",
+                  "cannot provide any information",
+                  "As an artificial intelligence I do not have the capability",
+                  "As an artificial intelligence I don't have the capability",
+                  "As an artificial intelligence I can't",
+                  "As an artificial intelligence I cannot",
+                  "I am sorry but I do not understand",
+                  "Can you please explain",
+                  "(sorry couldn't resist)",
+                  "(sorry could not resist)",
+                  " :)",
+                  " ;)",
+                  " :-)",
+                  " ;-)",
+                  " lol ",
+                  "Thanks so much!!!",
+                  "Thank You :)!!!",
+                  "Please try not to repeat",
+                  "I am an AI language model",
+                  "I'm a AI assistant that",
+                  "I'm an AI assistant that",
+                  "I am an AI assistant that",
+                  "etc.",
+                  "etc.etc.",
+                  "etc. etc.",
+                  "etc etc",
+                  ]
+    return unhelpful
+def test_check_unhelpful():
+    """
+    Count how often the phrases from get_unhelpful_list() occur in a graded conversation file,
+    first over the whole data and then over the bot turns that survive the enabled filters,
+    and assert that no such phrase remains in the surviving bot turns.
+    Reads a hard-coded local JSON file, so it only runs on a machine that has that file.
+    """
+    # file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json'
+    file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'
+    # file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
+    unhelpful = get_unhelpful_list()
+    # data = json.load(open(file, 'rt'))
+    df = pd.read_json(file)
+    # switches selecting which of the three filters below are applied
+    use_reward_score_threshold = False
+    use_bleu_threshold = False
+    use_sentence_sim = True
+    from sacrebleu.metrics import BLEU
+    # only referenced by the commented-out line inside get_bleu
+    bleu = BLEU()
+    from nltk.translate.bleu_score import sentence_bleu
+    def get_bleu(actual, expected_list):
+        # score `actual` against the reference list using nltk's sentence_bleu
+        # return bleu.sentence_score(actual, expected_list).score
+        return sentence_bleu(expected_list, actual)
+    threshold = 0.0
+    if use_reward_score_threshold:
+        # keep only rows whose 'grade_deberta' column exceeds the threshold
+        df = df[df['grade_deberta'] > threshold]
+    # back to as if original json load
+    data = df.to_dict(orient='records')
+    # substring counts of every unhelpful phrase over the stringified records
+    bads = {}
+    string_all = str(data)
+    for sub in unhelpful:
+        bads[sub] = string_all.count(sub)
+    # drop phrases that never occur
+    bads = {k: v for k, v in bads.items() if v > 0}
+    import pprint
+    pp = pprint.PrettyPrinter(indent=4)
+    pp.pprint(bads)
+    total_bads = sum(list(bads.values()))
+    print('total_bads: %s' % total_bads, flush=True)
+    # check just bot
+    import re
+    # `human` and `bot` are not defined in this block; presumably module-level turn markers -- confirm.
+    # Each record's 'input' is split on either marker and empty pieces are discarded.
+    convs = [[x.strip() for x in re.split(r'%s|%s' % (human, bot), y['input']) if x.strip()] for y in data]
+    # even positions are taken as human turns, odd positions as bot turns
+    # (assumes every conversation starts with a human turn -- TODO confirm)
+    humans = [[x for i, x in enumerate(y) if i % 2 == 0] for y in convs]
+    bots = [[x for i, x in enumerate(y) if i % 2 == 1] for y in convs]
+    # FIXME: apply back to json etc., just see for now
+    bleu_threshold = 0.9
+    if use_bleu_threshold:
+        # removes individual bot turns whose BLEU score against the phrase list reaches the threshold
+        bots = [[x for x in y if get_bleu(x, unhelpful) < bleu_threshold] for y in tqdm(bots)]
+    cosine_sim_threshold = 0.8
+    if use_sentence_sim:
+        # pip install sentence_transformers-2.2.2
+        from sentence_transformers import SentenceTransformer
+        # sent_model = 'bert-base-nli-mean-tokens'
+        # sent_model = 'nli-distilroberta-base-v2'
+        sent_model = 'all-MiniLM-L6-v2'
+        model = SentenceTransformer(sent_model)
+        sentence_embeddings = model.encode(unhelpful)
+        from sklearn.metrics.pairwise import cosine_similarity
+        # Filters at conversation level: a conversation's whole list of bot turns is dropped when
+        # the largest similarity between any of its turns and any unhelpful phrase reaches the threshold.
+        # NOTE(review): a conversation with no bot turns passes an empty list to model.encode -- confirm this cannot happen.
+        bots = [x for x in tqdm(bots) if
+                np.max(cosine_similarity(model.encode(x), sentence_embeddings)) < cosine_sim_threshold]
+    # repeat the substring count, now only over the surviving bot turns
+    bads_bots = {}
+    string_all = str(bots)
+    for sub in unhelpful:
+        bads_bots[sub] = string_all.count(sub)
+    bads_bots = {k: v for k, v in bads_bots.items() if v > 0}
+    import pprint
+    pp = pprint.PrettyPrinter(indent=4)
+    pp.pprint(bads_bots)
+    total_bads_bots = sum(list(bads_bots.values()))
+    # len(bots) is the post-filter count, while len(humans) is the unfiltered conversation count
+    print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % (
+    threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True)
+    # assert len(bads) == 0, bads
+    assert len(bads_bots) == 0, bads_bots
+def test_fortune2000_personalized():
+    row_list = []
+    import glob
+    if not os.path.isdir("wikitext"):
+        raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip")
+    for file in glob.glob("wikitext/*.txt"):
+        with open(file, "r") as f:
+            blob = f.read()
+        N = 512 * 4
+        row_list.extend([{'input': s, 'prompt_type': 'plain', 'source': "%s" % os.path.basename(file)}
+                         for s in get_sentences(blob, N) if s])
+    personality = create_personality_data()
+    import copy
+    for i in range(10):
+        row_list.extend(copy.deepcopy(personality))
+    np.random.seed(123)
+    np.random.shuffle(row_list)
+    for i in range(len(row_list)):
+        row_list[i]['id'] = i
+    for i in range(len(row_list)):
+        assert row_list[i]['id'] == i
+    with open("h2ogpt-fortune2000-personalized.json", "w") as ff:
+        ff.write(json.dumps(row_list, indent=2))

data/NGSL_1.2_stats.csv.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34993377b20347d5c8837a1101bde4e403232f5f08c80f9441e16ac7a23228a7
+size 25168

data/README-template.md ADDED Viewed

	@@ -0,0 +1,23 @@

+---
+license: apache-2.0
+language:
+- en
+thumbnail: https://h2o.ai/etc.clientlibs/h2o/clientlibs/clientlib-site/resources/images/favicon.ico
+tags:
+- gpt
+- llm
+- large language model
+- open-source
+---
+# h2oGPT Data Card
+## Summary
+H2O.ai's `<<DATASET_NAME>>` is an open-source instruct-type dataset for fine-tuning of large language models, licensed for commercial use.
+- Number of rows: `<<NROWS>>`
+- Number of columns: `<<NCOLS>>`
+- Column names: `<<COLNAMES>>`
+## Source
+<<SOURCE_LINK>>

data/censor_words.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fuck-buddy
+fuck-buddys
+clusterfuck
+fuckup
+fuckups
+dumbfuck
+dumbfucks
+mindfuck
+*fucking
+fuckin'

data/config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/count_1w.txt.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:434d0f21e4156386a3e9156c669e316878fbb1664414ffe06f22e254a180d363
+size 2234946

data/create_data_cards.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import shutil
+import pandas as pd
+import os
+import huggingface_hub
+import pytest
+from datasets import load_dataset
+@pytest.mark.parametrize(
+    "dataset_name, link_to_source",
+    [
+        (
+                "h2ogpt-oig-instruct-cleaned",
+                """
+- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
+- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/b8f15efcc305a953c52a0ee25b8b4897ceb68c0a/scrape_dai_docs.py)
+"""
+        ),
+        (
+                "h2ogpt-oig-instruct-cleaned-v2",
+                """
+- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
+- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/40c217f610766715acec297a5535eb440ac2f2e2/create_data.py)
+"""
+        ),
+        (
+                "h2ogpt-oig-instruct-cleaned-v3",
+                """
+- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
+- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/bfc3778c8db938761ce2093351bf2bf82159291e/create_data.py)
+"""
+        ),
+        (
+                "openassistant_oasst1",
+                """
+- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/45e6183171fb16691ad7d3ab006fad973f971e98/create_data.py#L1253)
+"""
+        ),
+        (
+                "h2ogpt-oig-oasst1-instruct-cleaned-v1",
+                """
+- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
+- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/main/docs/FINETUNE.md#high-quality-oig-based-instruct-data)
+- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/5fc91911bc2bfaaf3b6c2de577c4b0ae45a07a4a/create_data.py#L1253)
+"""
+        ),
+        (
+                "h2ogpt-oig-oasst1-instruct-cleaned-v2",
+                """
+- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
+- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/main/docs/FINETUNE.md#high-quality-oig-based-instruct-data)
+- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/0e70c2fbb16410bd8e6992d879b4c55cd981211f/create_data.py#L1375-L1415)
+"""
+        ),
+        (
+                "h2ogpt-oig-oasst1-instruct-cleaned-v3",
+                """
+- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
+- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/main/docs/FINETUNE.md#high-quality-oig-based-instruct-data)
+- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/6728938a262d3eb5e8db1f252bbcd7de838da452/create_data.py#L1415)
+"""
+        ),
+        (
+                "openassistant_oasst1_h2ogpt",
+                """
+- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/83857fcf7d3b712aad5db32207e6db0ab0f780f9/create_data.py#L1252)
+"""
+        ),
+        (
+                "openassistant_oasst1_h2ogpt_graded",
+                """
+- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/d1f8ce975a46056d41135d126dd33de8499aa26e/create_data.py#L1259)
+"""
+        ),
+        (
+                "h2ogpt-fortune2000-personalized",
+                """
+- [Fortune 2000 companies from Wikipedia](https://github.com/h2oai/h2ogpt/blob/b1ea74c0088884ebff97f1ccddbfb3f393e29e44/create_data.py#L1743)
+"""
+        ),
+    ],
+)
+def test_create_data_cards(dataset_name, link_to_source):
+    if dataset_name != "h2ogpt-fortune2000-personalized":
+        return
+    #
+    assert os.path.exists("README-template.md"), "must be running this test from the data dir."
+    shutil.rmtree(dataset_name, ignore_errors=True)
+    try:
+        repo = huggingface_hub.Repository(
+            local_dir=dataset_name,
+            clone_from="h2oai/%s" % dataset_name,
+            repo_type="dataset",
+            skip_lfs_files=True,
+            token=True,
+        )
+        repo.git_pull()
+    except Exception as e:
+        print(str(e))
+        print("call 'huggingface_cli login' first and provide access token with write permission")
+    dataset = load_dataset("h2oai/%s" % dataset_name)["train"]
+    pd.set_option('display.max_columns', None)
+    with open("README-template.md", "r") as f:
+        content = f.read()
+        assert "<<DATASET_NAME>>" in content
+        content = content.replace("<<DATASET_NAME>>", dataset_name)
+        assert "<<NROWS>>" in content
+        content = content.replace("<<NROWS>>", str(dataset.num_rows))
+        assert "<<NCOLS>>" in content
+        content = content.replace("<<NCOLS>>", str(dataset.num_columns))
+        assert "<<COLNAMES>>" in content
+        content = content.replace("<<COLNAMES>>", str(dataset.column_names))
+        # assert "<<PREVIEW>>" in content
+        # content = content.replace("<<PREVIEW>>", str(dataset.to_pandas().iloc[:5, :]))
+        assert "<<SOURCE_LINK>>" in content
+        content = content.replace("<<SOURCE_LINK>>", link_to_source)
+        assert "<<" not in content
+        assert ">>" not in content
+    with open(os.path.join(dataset_name, "README.md"), "w") as f:
+        f.write(content)
+    try:
+        repo.commit("Update README.md")
+        repo.push_to_hub()
+    except Exception as e:
+        print(str(e))

data/dai_docs.train.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/dai_docs.train_cleaned.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/dai_docs.valid.json ADDED Viewed

	@@ -0,0 +1,101 @@

+[
+  {
+    "output": " .. _feature_store:\n\n##########################\nH2O AI Feature Store Setup\n##########################\n\nYou can use the H2O AI Feature Store to store, update, and share the features data scientists, developers, and engineers need to build AI models."
+  },
+  {
+    "output": " Note: For more information on the H2O AI Feature Store, refer to the `official documentation <https://docs.h2o.ai/feature-store/latest-stable/docs/index.html>`_."
+  },
+  {
+    "output": " To enable the Feature Store data connector, ``feature_store`` must be added to this list of data sources."
+  },
+  {
+    "output": " - ``feature_store_enable_tls``: To enable TLS communication between DAI and the Feature Store server, set this to ``true``."
+  },
+  {
+    "output": " - ``authentication_method``: The authentication method used by DAI. When enabling the Feature Store data connector, this must be set to OpenID Connect (``authentication_method=\"oidc\"``)."
+  },
+  {
+    "output": " \nUpgrading the Driverless AI Image\n~\n\n.. include:: upgrade-warning.frag\n \nUpgrading from Version 1.2.2 or Earlier\n'\n\nIt is not possible to upgrade from version 1.2.2 or earlier to the latest version."
+  },
+  {
+    "output": " Be sure to backup your data before doing this. Upgrading from Version 1.3.0 to 1.5.1\n'\n\n1. SSH into the IP address of the image instance and copy the existing experiments to a backup location:\n\n .. code-block:: bash\n\n  # Set up a directory of the previous version name\n  mkdir dai_rel_1.3.0\n\n  # Copy the data, log, license, and tmp directories as backup\n  cp -a ./data dai_rel_1.3.0/data\n  cp -a ./log dai_rel_1.3.0/log\n  cp -a ./license dai_rel_1.3.0/license\n  cp -a ./tmp dai_rel_1.3.0/tmp\n\n2."
+  },
+  {
+    "output": " Replace VERSION and BUILD below with the Driverless AI version. .. code-block:: bash\n\n   wget https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/dai/VERSION-BUILD/x86_64/dai-docker-ubi8-x86_64-VERSION.tar.gz\n\n3."
+  },
+  {
+    "output": " Run ``docker images`` to find the new image tag. 5. Start the Driverless AI Docker image and replace TAG below with the image tag."
+  },
+  {
+    "output": " Note: Use ``docker version`` to check which version of Docker you are using. .. tabs::\n\n   .. tab:: >= Docker 19.03\n\n    .. code-block:: bash\n\n      # Start the Driverless AI Docker image\n      docker run runtime=nvidia \\\n          pid=host \\\n          init \\\n          rm \\\n          shm-size=256m \\\n          -u `id -u`:`id -g` \\\n          -p 12345:12345 \\\n          -v `pwd`/data:/data \\\n          -v `pwd`/log:/log \\\n          -v `pwd`/license:/license \\\n          -v `pwd`/tmp:/tmp \\\n          h2oai/dai-ubi8-x86_64:TAG\n\n   .. tab:: < Docker 19.03\n\n    .. code-block:: bash\n\n      # Start the Driverless AI Docker image\n      nvidia-docker run \\\n          pid=host \\\n          init \\\n          rm \\\n          shm-size=256m \\\n          -u `id -u`:`id -g` \\\n          -p 12345:12345 \\\n          -v `pwd`/data:/data \\\n          -v `pwd`/log:/log \\\n          -v `pwd`/license:/license \\\n          -v `pwd`/tmp:/tmp \\\n          h2oai/dai-ubi8-x86_64:TAG\n\nUpgrading from version 1.5.2 or Later\n'\n\nUpgrading to versions 1.5.2 and later is no longer done via Docker."
+  },
+  {
+    "output": " Replace ``dai_NEWVERSION.deb`` below with the new Driverless AI version (for example, ``dai_1.8.4.1_amd64.deb``)."
+  },
+  {
+    "output": " You do not need to manually specify the DAI_USER or DAI_GROUP environment variables during an upgrade."
+  },
+  {
+    "output": " Driverless AI ships with CUDA 11.2.2 for GPUs, but the driver must exist in the host environment. Go to `NVIDIA download driver <https://www.nvidia.com/Download/index.aspx>`__ to get the latest NVIDIA Tesla A/T/V/P/K series drivers."
+  },
+  {
+    "output": " .. note::\n\tIf you are using K80 GPUs, the minimum required NVIDIA driver version is 450.80.02. .. code-block:: bash\n\n  # Stop Driverless AI."
+  },
+  {
+    "output": " .. _feature_store:\n\n##########################\nH2O AI Feature Store Setup\n##########################\n\nYou can use the H2O AI Feature Store to store, update, and share the features data scientists, developers, and engineers need to build AI models."
+  },
+  {
+    "output": " Note: For more information on the H2O AI Feature Store, refer to the `official documentation <https://docs.h2o.ai/feature-store/latest-stable/docs/index.html>`_. Description of relevant configuration attributes\n\n\nThe following are descriptions of the relevant configuration attributes when enabling the H2O AI Feature Store data connector:\n\n- ``enabled_file_systems``: A list of file systems you want to enable."
+  },
+  {
+    "output": " - ``feature_store_endpoint_url``: A URL that points to the Feature Store server. - ``feature_store_enable_tls``: To enable TLS communication between DAI and the Feature Store server, set this to ``true``."
+  },
+  {
+    "output": " - ``authentication_method``: The authentication method used by DAI. When enabling the Feature Store data connector, this must be set to OpenID Connect (``authentication_method=\"oidc\"``). For information on setting up OIDC Authentication in Driverless AI, see :ref:`oidc_auth`."
+  },
+  {
+    "output": " \nUpgrading the Driverless AI Image\n~\n\n.. include:: upgrade-warning.frag\n \nUpgrading from Version 1.2.2 or Earlier\n'\n\nIt is not possible to upgrade from version 1.2.2 or earlier to the latest version."
+  },
+  {
+    "output": " Be sure to backup your data before doing this. Upgrading from Version 1.3.0 to 1.5.1\n'\n\n1. SSH into the IP address of the image instance and copy the existing experiments to a backup location:\n\n .. code-block:: bash\n\n  # Set up a directory of the previous version name\n  mkdir dai_rel_1.3.0\n\n  # Copy the data, log, license, and tmp directories as backup\n  cp -a ./data dai_rel_1.3.0/data\n  cp -a ./log dai_rel_1.3.0/log\n  cp -a ./license dai_rel_1.3.0/license\n  cp -a ./tmp dai_rel_1.3.0/tmp\n\n2."
+  },
+  {
+    "output": " Replace VERSION and BUILD below with the Driverless AI version. .. code-block:: bash\n\n   wget https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/dai/VERSION-BUILD/x86_64/dai-docker-ubi8-x86_64-VERSION.tar.gz\n\n3."
+  },
+  {
+    "output": " Run ``docker images`` to find the new image tag. 5. Start the Driverless AI Docker image and replace TAG below with the image tag. Depending on your install version, use the ``docker run runtime=nvidia`` (>= Docker 19.03) or ``nvidia-docker`` (< Docker 19.03) command."
+  },
+  {
+    "output": " .. tabs::\n\n   .. tab:: >= Docker 19.03\n\n    .. code-block:: bash\n\n      # Start the Driverless AI Docker image\n      docker run runtime=nvidia \\\n          pid=host \\\n          init \\\n          rm \\\n          shm-size=256m \\\n          -u `id -u`:`id -g` \\\n          -p 12345:12345 \\\n          -v `pwd`/data:/data \\\n          -v `pwd`/log:/log \\\n          -v `pwd`/license:/license \\\n          -v `pwd`/tmp:/tmp \\\n          h2oai/dai-ubi8-x86_64:TAG\n\n   .. tab:: < Docker 19.03\n\n    .. code-block:: bash\n\n      # Start the Driverless AI Docker image\n      nvidia-docker run \\\n          pid=host \\\n          init \\\n          rm \\\n          shm-size=256m \\\n          -u `id -u`:`id -g` \\\n          -p 12345:12345 \\\n          -v `pwd`/data:/data \\\n          -v `pwd`/log:/log \\\n          -v `pwd`/license:/license \\\n          -v `pwd`/tmp:/tmp \\\n          h2oai/dai-ubi8-x86_64:TAG\n\nUpgrading from version 1.5.2 or Later\n'\n\nUpgrading to versions 1.5.2 and later is no longer done via Docker."
+  },
+  {
+    "output": " Replace ``dai_NEWVERSION.deb`` below with the new Driverless AI version (for example, ``dai_1.8.4.1_amd64.deb``). Note that this upgrade process inherits the service user and group from /etc/dai/User.conf and /etc/dai/Group.conf."
+  },
+  {
+    "output": " We recommend to have NVIDIA driver >= |NVIDIA-driver-ver| installed (GPU only) in your host environment for a seamless experience on all architectures, including Ampere. Driverless AI ships with CUDA 11.2.2 for GPUs, but the driver must exist in the host environment."
+  },
+  {
+    "output": " For reference on CUDA Toolkit and Minimum Required Driver Versions and CUDA Toolkit and Corresponding Driver Versions, see `here <https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html>`__ ."
+  },
+  {
+    "output": " .. _feature_store:\n\n##########################\nH2O AI Feature Store Setup\n##########################\n\nYou can use the H2O AI Feature Store to store, update, and share the features data scientists, developers, and engineers need to build AI models. This page describes how to configure Driverless AI to work with the H2O AI Feature Store. Note: For more information on the H2O AI Feature Store, refer to the `official documentation <https://docs.h2o.ai/feature-store/latest-stable/docs/index.html>`_."
+  },
+  {
+    "output": " To enable the Feature Store data connector, ``feature_store`` must be added to this list of data sources. - ``feature_store_endpoint_url``: A URL that points to the Feature Store server. - ``feature_store_enable_tls``: To enable TLS communication between DAI and the Feature Store server, set this to ``true``. - ``feature_store_access_token_scopes``: A space-separated list of access token scopes used by the Feature Store connector for authentication. - ``authentication_method``: The authentication method used by DAI."
+  },
+  {
+    "output": " \nUpgrading the Driverless AI Image\n~\n\n.. include:: upgrade-warning.frag\n \nUpgrading from Version 1.2.2 or Earlier\n'\n\nIt is not possible to upgrade from version 1.2.2 or earlier to the latest version. You have to manually remove the 1.2.2 container and then reinstall the latest Driverless AI version. Be sure to backup your data before doing this. Upgrading from Version 1.3.0 to 1.5.1\n'\n\n1. SSH into the IP address of the image instance and copy the existing experiments to a backup location:\n\n .. code-block:: bash\n\n  # Set up a directory of the previous version name\n  mkdir dai_rel_1.3.0\n\n  # Copy the data, log, license, and tmp directories as backup\n  cp -a ./data dai_rel_1.3.0/data\n  cp -a ./log dai_rel_1.3.0/log\n  cp -a ./license dai_rel_1.3.0/license\n  cp -a ./tmp dai_rel_1.3.0/tmp\n\n2."
+  },
+  {
+    "output": " Replace VERSION and BUILD below with the Driverless AI version. .. code-block:: bash\n\n   wget https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/dai/VERSION-BUILD/x86_64/dai-docker-ubi8-x86_64-VERSION.tar.gz\n\n3. Use the ``docker load`` command to load the image:\n\n .. code-block:: bash\n\n   docker load < dai-docker-ubi8-x86_64-VERSION.tar.gz\n\n4. Run ``docker images`` to find the new image tag. 5. Start the Driverless AI Docker image and replace TAG below with the image tag. Depending on your install version, use the ``docker run runtime=nvidia`` (>= Docker 19.03) or ``nvidia-docker`` (< Docker 19.03) command."
+  },
+  {
+    "output": " .. tabs::\n\n   .. tab:: >= Docker 19.03\n\n    .. code-block:: bash\n\n      # Start the Driverless AI Docker image\n      docker run runtime=nvidia \\\n          pid=host \\\n          init \\\n          rm \\\n          shm-size=256m \\\n          -u `id -u`:`id -g` \\\n          -p 12345:12345 \\\n          -v `pwd`/data:/data \\\n          -v `pwd`/log:/log \\\n          -v `pwd`/license:/license \\\n          -v `pwd`/tmp:/tmp \\\n          h2oai/dai-ubi8-x86_64:TAG\n\n   .. tab:: < Docker 19.03\n\n    .. code-block:: bash\n\n      # Start the Driverless AI Docker image\n      nvidia-docker run \\\n          pid=host \\\n          init \\\n          rm \\\n          shm-size=256m \\\n          -u `id -u`:`id -g` \\\n          -p 12345:12345 \\\n          -v `pwd`/data:/data \\\n          -v `pwd`/log:/log \\\n          -v `pwd`/license:/license \\\n          -v `pwd`/tmp:/tmp \\\n          h2oai/dai-ubi8-x86_64:TAG\n\nUpgrading from version 1.5.2 or Later\n'\n\nUpgrading to versions 1.5.2 and later is no longer done via Docker."
+  },
+  {
+    "output": " Replace ``dai_NEWVERSION.deb`` below with the new Driverless AI version (for example, ``dai_1.8.4.1_amd64.deb``). Note that this upgrade process inherits the service user and group from /etc/dai/User.conf and /etc/dai/Group.conf. You do not need to manually specify the DAI_USER or DAI_GROUP environment variables during an upgrade. We recommend to have NVIDIA driver >= |NVIDIA-driver-ver| installed (GPU only) in your host environment for a seamless experience on all architectures, including Ampere."
+  },
+  {
+    "output": " Go to `NVIDIA download driver <https://www.nvidia.com/Download/index.aspx>`__ to get the latest NVIDIA Tesla A/T/V/P/K series drivers. For reference on CUDA Toolkit and Minimum Required Driver Versions and CUDA Toolkit and Corresponding Driver Versions, see `here <https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html>`__ . .. note::\n\tIf you are using K80 GPUs, the minimum required NVIDIA driver version is 450.80.02. .. code-block:: bash\n\n  # Stop Driverless AI. sudo systemctl stop dai\n\n  # Backup your /opt/h2oai/dai/tmp directory at this time."
+  }
+]

data/dai_faq.json ADDED Viewed

	@@ -0,0 +1,477 @@

+[
+  {
+    "instruction": "How is Driverless AI different than any other black box ML algorithm?",
+    "output": "Driverless AI uses many techniques (some older and some cutting-edge) for interpreting black box models including creating reason codes for every prediction the system makes. We have also created numerous open source code examples and free publications that explain these techniques. See the list below for links to these resources and for references for the interpretability techniques.Open source interpretability examples:https://github.com/jphall663/interpretable_machine_learning_with_pythonhttps://content.oreilly.com/oriole/Interpretable-machine-learning-with-Python-XGBoost-and-H2Ohttps://github.com/h2oai/mli-resourcesFree Machine Learning Interpretability publications:http://www.oreilly.com/data/free/an-introduction-to-machine-learning-interpretability.csphttp://docs.h2o.ai/driverless-ai/latest-stable/docs/booklets/MLIBooklet.pdfMachine Learning Techniques already in Driverless AI:Tree-based Variable Importance: https://web.stanford.edu/~hastie/ElemStatLearn/printings/ESLII_print12.pdfPartial Dependence: https://web.stanford.edu/~hastie/ElemStatLearn/printings/ESLII_print12.pdfLIME: http://www.kdd.org/kdd2016/papers/files/rfp0573-ribeiroA.pdfLOCO: http://www.stat.cmu.edu/~ryantibs/papers/conformal.pdfICE: https://arxiv.org/pdf/1309.6392.pdfSurrogate Models:https://papers.nips.cc/paper/1152-extracting-tree-structured-representations-of-trained-networks.pdfhttps://arxiv.org/pdf/1705.08504.pdfShapley Explanations: http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How often do new versions come out?",
+    "output": "The frequency of major new Driverless AI releases has historically been about every two months.Installation/Upgrade/Authentication",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I change my username and password?",
+    "output": "The username and password are tied to the experiments you have created. For example, if I log in with the username/password: megan/megan and start an experiment, then I would need to log back in with the same username and password to see those experiments. The username and password, however, does not limit your access to Driverless AI. If you want to use a new user name and password, you can log in again with a new username and password, but keep in mind that you won't see your old experiments.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can Driverless AI run on CPU-only machines?",
+    "output": "Yes, Driverless AI can run on machines with CPUs only, though GPUs are recommended. Installation instructions are available for GPU and CPU systems. Refer to :ref:`before_you_begin` for more information.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I upgrade to a newer version of Driverless AI?",
+    "output": "Upgrade instructions vary depending on your environment. Refer to the installation section for your environment. Upgrade instructions are included there.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What kind of authentication is supported in Driverless AI?",
+    "output": "Driverless AI supports Client Certificate, LDAP, Local, mTLS, OpenID, none, and unvalidated (default) authentication. These can be configured by setting the appropriate environment variables in the config.toml file or by specifying the environment variables when starting Driverless AI. Refer to :ref:`dai_auth` for more information.No role entry for \"ref\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"ref\" as canonical role name.Unknown interpreted text role \"ref\".",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I automatically turn on persistence each time the GPU system reboots?",
+    "output": "For GPU machines, the sudo nvidia-persistenced --user dai command can be run after each reboot to enable persistence. For systems that have systemd, it is possible to automatically enable persistence after each reboot by removing the --no-persistence-mode flag from nvidia-persistenced.service. Before running the steps below, be sure to review the following for more information:https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemonhttps://docs.nvidia.com/deploy/driver-persistence/index.html#installationRun the following to stop the nvidia-persistenced.service:Cannot analyze code. Pygments package not found... code:: bash\n\n sudo systemctl stop nvidia-persistenced.service\nOpen the file /lib/systemd/system/nvidia-persistenced.service. This file includes a line \"ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --no-persistence-mode --verbose\".Remove the flag --no-persistence-mode from that line so that it reads:Enumerated list start value not ordinal-1: \"2\" (ordinal 2)Cannot analyze code. Pygments package not found... code:: bash\n\n ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --verbose\nRun the following command to start the nvidia-persistenced.service:Enumerated list start value not ordinal-1: \"4\" (ordinal 4)Cannot analyze code. Pygments package not found... code:: bash\n\n sudo systemctl start nvidia-persistenced.service\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I start Driverless AI on a different port than 12345?",
+    "output": "No directive entry for \"tabs\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"tabs\" as canonical directive name.Unknown directive type \"tabs\"... tabs::\n   .. group-tab:: Docker Image Installs\n\n     When starting Driverless AI in Docker, the ``-p`` option specifies the port on which Driverless AI will run. Change this option in the start script if you need to run on a port other than 12345. The following example shows how to run on port 22345. (Change ``nvidia-docker run`` to ``docker-run`` if needed.) Keep in mind that `priviliged ports will require root access <https://www.w3.org/Daemon/User/Installation/PrivilegedPorts.html>`__.\n\n     .. code-block:: bash\n        :substitutions:\n\n         nvidia-docker run \\\n         --pid=host \\\n         --init \\\n         --rm \\\n         --shm-size=256m \\\n         -u `id -u`:`id -g` \\\n         -p 22345:12345 \\\n         -v `pwd`/data:/data \\\n         -v `pwd`/log:/log \\\n         -v `pwd`/license:/license \\\n         -v `pwd`/tmp:/tmp \\\n         h2oai/dai-ubi8-x86_64:|tag|\n\n   .. group-tab:: Native Installs\n\n     To run on a port other than 12345, update the port value in the **config.toml** file. The following example shows how to run Driverless AI on port 22345. Keep in mind that `priviliged ports will require root access <https://www.w3.org/Daemon/User/Installation/PrivilegedPorts.html>`__.\n\n     ::\n\n       # Export the Driverless AI config.toml file (or add it to ~/.bashrc)\n       export DRIVERLESS_AI_CONFIG_FILE=\u201c/config/config.toml\u201d\n\n       # IP address and port for Driverless AI HTTP server.\n       ip = \"127.0.0.1\"\n       port = 22345\n\n     Point to this updated config file when restarting Driverless AI.\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can I set up TLS/SSL on Driverless AI?",
+    "output": "Yes, Driverless AI provides configuration options that let you set up HTTPS/TLS/SSL. You will need to have your own SSL certificate, or you can create a self-signed certificate for yourself.To enable HTTPS/TLS/SSL on the Driverless AI server, add the following to the config.toml file:Cannot analyze code. Pygments package not found... code:: bash\n\n enable_https = true\n ssl_key_file = \"/etc/dai/private_key.pem\"\n ssl_crt_file = \"/etc/dai/cert.pem\"\nYou can make a self-signed certificate for testing with the following commands:Cannot analyze code. Pygments package not found... code:: bash\n\n umask 077\n openssl req -x509 -newkey rsa:4096 -keyout private_key.pem -out cert.pem -days 20 -nodes -subj '/O=Driverless AI'\n sudo chown dai:dai cert.pem private_key.pem\n sudo mv cert.pem private_key.pem /etc/dai\nTo configure specific versions of TLS/SSL, enable or disable the following settings in the config.toml file:Cannot analyze code. Pygments package not found... code:: bash\n\n       ssl_no_sslv2 = true\n       ssl_no_sslv3 = true\n       ssl_no_tlsv1 = true\n       ssl_no_tlsv1_1 = true\n       ssl_no_tlsv1_2 = false\n       ssl_no_tlsv1_3 = false\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can I set up TLS/SSL on Driverless AI in AWS?",
+    "output": "Yes, you can set up HTTPS/TLS/SSL on Driverless AI running in an AWS environment. HTTPS/TLS/SSL needs to be configured on the host machine, and the necessary ports will need to be opened on the AWS side. You will need to have your own TLS/SSL cert or you can create a self signed cert for yourself. The following is a very simple example showing how to configure HTTPS with a proxy pass to the port on the container 12345 with the keys placed in /etc/nginx/. Replace <server_name> with your server name.Cannot analyze code. Pygments package not found... code:: bash\n\n       server {\n           listen 80;\n           return 301 https://$host$request_uri;\n       }\n\n       server {\n           listen 443;\n\n           # Specify your server name here\n           server_name <server_name>;\n\n           ssl_certificate           /etc/nginx/cert.crt;\n           ssl_certificate_key       /etc/nginx/cert.key;\n           ssl on;\n           ssl_session_cache  builtin:1000  shared:SSL:10m;\n           ssl_protocols  TLSv1 TLSv1.1 TLSv1.2;\n           ssl_ciphers HIGH:!aNULL:!eNULL:!EXPORT:!CAMELLIA:!DES:!MD5:!PSK:!RC4;\n           ssl_prefer_server_ciphers on;\n\n           access_log            /var/log/nginx/dai.access.log;\n\n           location / {\n             proxy_set_header        Host $host;\n             proxy_set_header        X-Real-IP $remote_addr;\n             proxy_set_header        X-Forwarded-For $proxy_add_x_forwarded_for;\n             proxy_set_header        X-Forwarded-Proto $scheme;\n\n             # Fix the \"It appears that your reverse proxy set up is broken\" error.\n             proxy_pass          http://localhost:12345;\n             proxy_read_timeout  90;\n\n             # Specify your server name for the redirect\n             proxy_redirect      http://localhost:12345 https://<server_name>;\n           }\n       }\nMore information about SSL for Nginx in Ubuntu 16.04 can be found here: https://www.digitalocean.com/community/tutorials/how-to-create-a-self-signed-ssl-certificate-for-nginx-in-ubuntu-16-04.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I received a \"package dai-<version>.x86_64 does not verify: no digest\" error during the installation. How can I fix this?",
+    "output": "You will receive a \"package dai-<version>.x86_64 does not verify: no digest\" error when installing the rpm using an RPM version newer than 4.11.3. You can run the following as a workaround, replacing <version> with your DAI version:\n\n rpm --nodigest -i dai-<version>.x86_64.rpm\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I received a \"Must have exactly one OpenCL platform 'NVIDIA CUDA'\" error. How can I fix that?",
+    "output": "If you encounter problems with opencl errors at server time, you may see the following message:Cannot analyze code. Pygments package not found... code:: bash\n\n  2018-11-08 14:26:15,341 C:  D:452.2GB M:246.0GB 21603 ERROR  : Must have exactly one OpenCL platform 'NVIDIA CUDA', but got:\n  Platform #0: Clover\n  Platform #1: NVIDIA CUDA\n   +-- Device #0: GeForce GTX 1080 Ti\n   +-- Device #1: GeForce GTX 1080 Ti\n   +-- Device #2: GeForce GTX 1080 Ti\n\n  Uninstall all but 'NVIDIA CUDA' platform.\nFor Ubuntu, the solution is to run the following:Cannot analyze code. Pygments package not found... code:: bash\n\n  sudo apt-get remove mesa-opencl-icd\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Is it possible for multiple users to share a single Driverless AI instance?",
+    "output": "Driverless AI supports multiple users, and Driverless AI is licensed per a single named user. Therefore, in order to have different users run experiments simultaneously, they would each need a license. Driverless AI manages the GPU(s) that it is given and ensures that different experiments from different users can run safely simultaneously and don\u2019t interfere with each other. So when two licensed users log in with different credentials, then neither of them will see the other\u2019s experiment. Similarly, if a licensed user logs in using a different set of credentials, then that user will not see any previously run experiments.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can multiple Driverless AI users share a GPU server?",
+    "output": "Yes, you can allocate multiple users in a single GPU box. For example, a single box with four GPUs can allocate that User1 has two GPUs and User2 has the other two GPUs. This is accomplished by having two separated Driverless AI instances running on the same server.There are two ways to assign specific GPUs to Driverless AI. And in the scenario with four GPUs (two GPUs allocated to two users), both of these options allow each Docker container only to see two GPUs.Use the CUDA_VISIBLE_DEVICES environment variable. In the case of Docker deployment, this will translate in passing the -e CUDA_VISIBLE_DEVICES=\"0,1\" to the nvidia-docker run command.Passing the NV_GPU option at the beginning of the nvidia-docker run command. (See example below.)Error in \"code-block\" directive:\nunknown option: \"substitutions\"... code-block:: bash\n   :substitutions:\n\n   #Team 1\n   NV_GPU='0,1' nvidia-docker run\n   --pid=host\n   --init\n   --rm\n   --shm-size=256m\n   -u id -u:id -g\n   -p port-to-team:12345\n   -e DRIVERLESS_AI_CONFIG_FILE=\"/config/config.toml\"\n   -v /data:/data\n   -v /log:/log\n   -v /license:/license\n   -v /tmp:/tmp\n   -v /config:/config\n   h2oai/dai-ubi8-x86_64:|tag|\n\n\n   #Team 2\n   NV_GPU='0,1' nvidia-docker run\n   --pid=host\n   --init\n   --rm\n   --shm-size=256m\n   -u id -u:id -g\n   -p port-to-team:12345\n   -e DRIVERLESS_AI_CONFIG_FILE=\"/config/config.toml\"\n   -v /data:/data\n   -v /log:/log\n   -v /license:/license\n   -v /tmp:/tmp\n   -v /config:/config\n   h2oai/dai-ubi8-x86_64:|tag|\nNote, however, that a Driverless AI instance expects to fully utilize and not share the GPUs that are assigned to it. Sharing a GPU with other Driverless AI instances or other running programs can result in out-of-memory issues.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I retrieve a list of Driverless AI users?",
+    "output": "A list of users can be retrieved using the Python client.Cannot analyze code. Pygments package not found... code:: bash\n\n  h2o = Client(address='http://<client_url>:12345', username='<username>', password='<password>')\n  h2o.get_users()\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Start of Driverless AI fails on the message ``Segmentation fault (core dumped)`` on Ubuntu 18/RHEL 7.6. How can I fix this?",
+    "output": "This problem is caused by the font NotoColorEmoji.ttf, which cannot be processed by the Python matplotlib library. A workaround is to disable the font by renaming it. (Do not use fontconfig because it is ignored by matplotlib.) The following will print out the command that should be executed.Cannot analyze code. Pygments package not found... code:: bash\n\n  sudo find / -name \"NotoColorEmoji.ttf\" 2>/dev/null | xargs -I{} echo sudo mv {} {}.backup\n\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Which Linux systems does Driverless AI support?",
+    "output": "Supported Linux systems include x86_64 RHEL 7, RHEL 8, CentOS 7, and CentOS 8.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Is there a file size limit for datasets?",
+    "output": "For GBMs, the file size for datasets is limited by the collective CPU or GPU memory on the system, but we continue to make optimizations for getting more data into an experiment, such as using TensorFlow streaming to stream to arbitrarily large datasets.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I import CSV files that use UTF-8 encoding into Excel?",
+    "output": "Excel requires a byte order mark (BOM) to correctly identify CSV files that use UTF-8 encoding. Refer to the following FAQ entry for more information on how to use a BOM when writing CSV files with datatable.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can a byte order mark be used when writing CSV files with datatable?",
+    "output": "Yes, a byte order mark (BOM) can be used when writing CSV files with datatable by enabling datatable_bom_csv in the config.toml file when starting Driverless AI.Note: Support for UTF-8 encoding in Excel requires the use of a BOM.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Which version of Longhorn is supported by Driverless AI?",
+    "output": "Driverless AI supports Longhorn v1.1.0 or later.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Is it possible to download a transformed test dataset in Driverless AI?",
+    "output": "Yes, a transformed test dataset can be downloaded in Driverless AI. To do this, click Model Actions > Transform Dataset on the completed experiment page, then specify both a train and a test dataset to use for the transformation. The transformed test dataset is made available for download once this process is completed.Connectors",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why can't I import a folder as a file when using a data connector on Windows?",
+    "output": "If you try to use the Import Folder as File option via a data connector on Windows, the import will fail if the folder contains files that do not have file extensions. For example, if a folder contains the files file1.csv, file2.csv, file3.csv, and _SUCCESS, the function will fail due to the presence of the _SUCCESS file.Note that this only occurs if the data is sourced from a volume that is mounted from the Windows filesystem onto the Docker container via -v /path/to/windows/filesystem:/path/in/docker/container flags. This error occurs because of the difference in how files without file extensions are treated in Windows and in the Docker container (CentOS Linux).",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I get a ClassNotFoundException error when I try to select a JDBC connection. How can I fix that?",
+    "output": "The folder storing the JDBC jar file must be visible/readable by the dai process user.If you downloaded the JDBC jar file from Oracle, they may provide you with a tar.gz file that you can unpackage with the following command:Cannot analyze code. Pygments package not found... code:: bash\n\n tar --no-same-permissions --no-same-owner -xzvf <my-jdbc-driver.tar>.gz\nAlternatively you can ensure that the permissions on the file are correct in general by running the following:Cannot analyze code. Pygments package not found... code:: bash\n\n chmod -R o+rx /path/to/folder_containing_jar_file\nFinally, if you just want to check the permissions use the command ls -altr and check the final 3 values in the permissions output.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I get a org.datanucleus.exceptions.NucleusUserException: Please check your CLASSPATH and plugin specification error when attempting to connect to Hive. How can I fix that?",
+    "output": "Make sure hive-site.xml is configured in /etc/hive/conf and not in /etc/hadoop/conf.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I get a \"Permission Denied\" error during Hive import. How do I fix this?",
+    "output": "If you see the following error, your Driverless AI instance may not be able to create a temporary Hive folder due to file system permissions restrictions.Cannot analyze code. Pygments package not found... code:: bash\n\n       ERROR HiveAgent: Error during execution of query: java.lang.RuntimeException: java.lang.RuntimeException: java.io.IOException: Permission denied;\n       org.apache.spark.sql.AnalysisException: java.lang.RuntimeException: java.lang.RuntimeException: java.io.IOException: Permission denied;\nTo fix this error, add the following name-value pair to your hive-site.xml file to specify the location that is accessible to Driverless AI (that is, your Driverless AI /tmp directory).Cannot analyze code. Pygments package not found... code:: bash\n\n         <property>\n           <name>hive.exec.local.scratchdir</name>\n           <value>/path/to/dai/tmp</value>\n         </property>\nRecipes",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Where can I retrieve H2O's custom recipes?",
+    "output": "H2O's custom recipes can be obtained from the official :recipes-repo:`Recipes for Driverless AI repository <https://github.com/h2oai/driverlessai-recipes/tree/>`.No role entry for \"recipes-repo\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"recipes-repo\" as canonical role name.Unknown interpreted text role \"recipes-repo\".",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I create my own custom recipe?",
+    "output": "Refer to the :recipes-writing:`How to Write a Recipe <https://github.com/h2oai/driverlessai-recipes/blob/>` guide for details on how to create your own custom recipe.No role entry for \"recipes-writing\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"recipes-writing\" as canonical role name.Unknown interpreted text role \"recipes-writing\".",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Are MOJOs supported for experiments that use custom recipes?",
+    "output": "In most cases, MOJOs will not be available for custom recipes. Unless the recipe is simple, creating the MOJO is only possible with additional MOJO runtime support. Contact support@h2o.ai for more information about creating MOJOs for custom recipes. (Note: The Python Scoring Pipeline features full support for custom recipes.)",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I use BYOR in my airgapped installation?",
+    "output": "If your Driverless AI environment cannot access Internet and, thus, cannot access Driverless AI's \"Bring Your Own Recipes\" from GitHub, please contact H2O support. We can work with you directly to help you access recipes.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "When enabling recipes in Driverless AI, can I install Python packages from my organization's internal Python package index?",
+    "output": "Yes\u2014you can use the pip_install_options :ref:`TOML option <understanding-configs>` to specify your organization's internal Python package index as follows:No role entry for \"ref\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"ref\" as canonical role name.Unknown interpreted text role \"ref\".pip_install_options=\"['--extra-index-url', 'http://my-own-repo:port']\"For more information on the --extra-index-url <url> pip install option, refer to the official pip documentation.Experiments",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How much memory does Driverless AI require in order to run experiments?",
+    "output": "Right now, Driverless AI requires approximately 10x the size of the data in system memory.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How many columns can Driverless AI handle?",
+    "output": "Driverless AI has been tested on datasets with 10k columns. When running experiments on wide data, Driverless AI automatically checks if it is running out of memory, and if it is, it reduces the number of features until it can fit in memory. This may lead to a worse model, but Driverless AI shouldn't crash because the data is wide.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How should I use Driverless AI if I have large data?",
+    "output": "Driverless AI can handle large datasets out of the box. For very large datasets (more than 10 billion rows x columns), we recommend sampling your data for Driverless AI. Keep in mind that the goal of Driverless AI is to go through many features and models to find the best modeling pipeline, and not to just train a few models on the raw data (H2O-3 is ideally suited for that case).\nFor large datasets, the recommended steps are:\n- Run with the recommended accuracy/time/interpretability settings first, especially accuracy <= 7\n- Gradually increase accuracy settings to 7 and choose accuracy 9 or 10 only after observing runs with <= 7.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI detect the ID column?",
+    "output": "The ID column logic is one of the following:\n- The column is named 'id', 'Id', 'ID' or 'iD' exactly\n- The column contains a significant number of unique values (above max_relative_cardinality in the config.toml file or Max. allowed fraction of uniques for integer and categorical cols in Expert settings)",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can Driverless AI handle data with missing values/nulls?",
+    "output": "Yes, data that is imported into Driverless AI can include missing values. Feature engineering is fully aware of missing values, and missing values are treated as information - either as a special categorical level or as a special number. So for target encoding, for example, rows with a certain missing feature will belong to the same group. For Categorical Encoding where aggregations of a numeric columns are calculated for a grouped categorical column, missing values are kept. The formula for calculating the mean is the sum of non-missing values divided by the count of all non-missing values. For clustering, we impute missing values. And for frequency encoding, we count the number of rows that have a certain missing feature.The imputation strategy is as follows:XGBoost/LightGBM do not need missing value imputation and may, in fact, perform worse with any specific other strategy unless the user has a strong understanding of the data.Driverless AI automatically imputes missing values using the mean for GLM.Driverless AI provides an imputation setting for TensorFlow in the config.toml file: tf_nan_impute_value post-normalization. If you set this option to 0, then missing values will be imputed. Setting it to (for example) +5 will specify 5 standard deviations outside the distribution. The default for TensorFlow is -5, which specifies that TensorFlow will treat NAs like a missing value. We recommend that you specify 0 if the mean is better.More information is available in the Missing and Unseen Values Handling section.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI deal with categorical variables? What if an integer column should really be treated as categorical?",
+    "output": "If a column has string values, then Driverless AI will treat it as a categorical feature.  There are multiple methods for how Driverless AI converts the categorical variables to numeric.  These include:One Hot Encoding: creating dummy variables for each valueFrequency Encoding: replace category with how frequently it is seen in the dataTarget Encoding: replace category with the average target value (additional steps included to prevent overfitting)Weight of Evidence: calculate weight of evidence for each category (http://ucanalytics.com/blogs/information-value-and-weight-of-evidencebanking-case/)Driverless AI will try multiple methods for representing the column and determine which representation(s) are best.If the column has integers, Driverless AI will try treating the column as a categorical column and numeric column.  It will treat any integer column as both categorical and numeric if the number of unique values is less than 50.This is configurable in the config.toml file:Cannot analyze code. Pygments package not found... code:: bash\n\n        # Whether to treat some numerical features as categorical\n        # For instance, sometimes an integer column may not represent a numerical feature but\n        # represents different numerical codes instead.\n        num_as_cat = true\n\n        # Max number of unique values for integer/real columns to be treated as categoricals (test applies to first statistical_threshold_data_size_small rows only)\n        max_int_as_cat_uniques = 50\n(Note: Driverless AI will also check if the distribution of any numeric column differs significantly from the distribution of typical numerical data using Benford's Law.   If the column distribution does not obey Benford's Law, we will also try to treat it as categorical even if there are more than 50 unique values.)",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How are outliers handled?",
+    "output": "Outliers are not removed from the data. Instead Driverless AI finds the best way to represent data with outliers. For example, Driverless AI may find that binning a variable with outliers improves performance.For target columns, Driverless AI first determines the best representation of the column. It may find that for a target column with outliers, it is best to predict the log of the column.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "If I drop several columns from the Train dataset, will Driverless AI understand that it needs to drop the same columns from the Test dataset?",
+    "output": "If you drop columns from the training dataset, Driverless AI will do the same for the validation and test datasets (if the columns are present). There is no need for these columns because no features will be created from them.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Does Driverless AI treat numeric variables as categorical variables?",
+    "output": "In certain cases, yes. You can prevent this behavior by setting the num_as_cat variable in your installation's config.toml file to false. You can have finer grain control over this behavior by excluding the Numeric to Categorical Target Encoding Transformer and the Numeric To Categorical Weight of Evidence Transformer and their corresponding genes in your installation's config.toml file. To learn more about the config.toml file, see the :ref:`config_file` section.No role entry for \"ref\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"ref\" as canonical role name.Unknown interpreted text role \"ref\".",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Which algorithms are used in Driverless AI?",
+    "output": "Features are engineered with a proprietary stack of Kaggle-winning statistical approaches including some of the most sophisticated target encoding and likelihood estimates based on groupings, aggregations and joins, but we also employ linear models, neural nets, clustering and dimensionality reduction models and many traditional approaches such as one-hot encoding etc.On top of the engineered features, sophisticated models are fitted, including, but not limited to: XGBoost (both original XGBoost and 'lossguide' (LightGBM) mode), Decision Trees, GLM, TensorFlow (including a TensorFlow NLP recipe based on CNN Deeplearning models), RuleFit, FTRL (Follow the Regularized Leader), Isolation Forest, and Constant Models. (Refer to :ref:`supported_algorithms` for more information.) And additional algorithms can be added via :ref:`Recipes <custom-recipes>`.No role entry for \"ref\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"ref\" as canonical role name.Unknown interpreted text role \"ref\".No role entry for \"ref\" in module \"docutils.parsers.rst.languages.en\".\nTrying \"ref\" as canonical role name.Unknown interpreted text role \"ref\".In general, GBMs are the best single-shot algorithms. Since 2006, boosting methods have proven to be the most accurate for noisy predictive modeling tasks outside of pattern recognition in images and sound (https://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml06.pdf). The advent of XGBoost and Kaggle only cemented this position.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why do my selected algorithms not show up in the Experiment Preview?",
+    "output": "When changing the algorithms used via Expert Settings > Model and Expert Settings > Recipes, you may notice in the Experiment Preview that those changes are not applied. Driverless AI determines whether to include models and/or recipes based on a hierarchy of those expert settings as well as data types (numeric, categorical, text, image, etc.) and system properties (GPUs, multiple GPUs, etc.).\n- Setting an Algorithm to \"OFF\" in Expert Settings: If an algorithm is turned OFF in Expert Settings (for example, GLM Models) when running, then that algorithm will not be included in the experiment.\n- Algorithms Not Included from Recipes (BYOR): If an algorithm from a custom recipe is not selected for the experiment in the Include specific models option, then that algorithm will not be included in the experiment, regardless of whether that same algorithm is set to AUTO or ON on the Expert Settings > Model page.\n- Algorithms Not Specified as \"OFF\" and Included from Recipes: If a Driverless AI algorithm is specified as either \"AUTO\" or \"ON\" and additional models are selected for the experiment in the Include specific models option, then those algorithms may or may not be included in the experiment. Driverless AI will determine the algorithms to use based on the data and experiment type.\nTo show warnings in the preview for which models were not used, set show_inapplicable_models_preview = true in config.toml.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why do my selected transformers not show up in the Experiment Preview?",
+    "output": "When changing the transformers used via Expert Settings > Transformers and Expert Settings > Recipes, you may notice in the Experiment Preview that those changes are not applied. Driverless AI determines whether transformers can be used based upon data types (numeric, categorical, text, image, etc.) and system properties (GPUs, multiple GPUs, etc.).\n- Transformers Not Included from Recipes (BYOR): If a transformer from a custom recipe is not selected for the experiment in the Include specific transformers option, then that transformer will not be included in the experiment.\nTo show warnings in the preview for which transformers were not used, set show_inapplicable_transformers_preview = true in config.toml.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can we turn on TensorFlow Neural Networks so they are evaluated?",
+    "output": "Neural networks are considered by Driverless AI, although they may not be evaluated by default. To ensure that neural networks are tried, you can turn on TensorFlow in the Expert Settings. Once you have set TensorFlow to ON, you should see the Experiment Preview on the left hand side change and mention that it will evaluate TensorFlow models. We recommend using TensorFlow neural networks if you have a multinomial use case with more than 5 unique values.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Does Driverless AI standardize the data?",
+    "output": "Driverless AI will automatically do variable standardization for certain algorithms.  For example, with Linear Models and Neural Networks, the data is automatically standardized. For decision tree algorithms, however, we do not perform standardization because these algorithms do not benefit from standardization.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What objective function is used in XGBoost?",
+    "output": "The objective function used in XGBoost is: reg:squarederror and a custom absolute error objective function for regression; binary:logistic or multi:softprob for classification. The objective function does not change depending on the scorer chosen. The scorer influences parameter tuning only. For regression, Tweedie, Gamma, and Poisson regression objectives are supported. More information on the XGBoost instantiations can be found in the logs and in the model summary, both of which can be downloaded from the GUI or found in the /tmp/h2oai_experiment_<name>/ folder on the server.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Does Driverless AI perform internal or external validation?",
+    "output": "Driverless AI does internal validation when only training data is provided. It does external validation when training and validation data are provided. In either scenario, the validation data is used for all parameter tuning (models and features), not just for feature selection. Parameter tuning includes target transformation, model selection, feature engineering, feature selection, stacking, etc.Specifically:Internal validation (only training data given):Ideal when data is either close to i.i.d., or for time-series problemsInternal holdouts are used for parameter tuning, with temporal causality for time-series problemsWill do the full spectrum from single holdout split to 5-fold CV, depending on accuracy settingsNo need to split training data manuallyFinal models are trained using CV on the training dataExternal validation (training + validation data given):Ideal when there\u2019s some amount of drift in the data, and the validation set mimics the test set data better than the training dataNo training data wasted during training because training data not used for parameter tuningValidation data is used only for parameter tuning, and is not part of training dataNo CV possible because we explicitly do not want to overfit on the training dataNot allowed for time-series problems (see Time Series FAQ section that follows)Tip: If you want both training and validation data to be used for parameter tuning (the training process), just concatenate the datasets together and turn them both into training data for the \u201cinternal validation\u201d method.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI prevent overfitting?",
+    "output": "Driverless AI performs a number of checks to prevent overfitting. For example, during certain transformations, Driverless AI calculates the average on out-of-fold data using cross validation. Driverless AI also performs early stopping for every model built, ensuring that the model build will stop when it ceases to improve on holdout data. And additional steps to prevent overfitting include checking for i.i.d. and avoiding leakage during feature engineering. A blog post describing Driverless AI overfitting protection in greater detail is available here: https://www.h2o.ai/blog/driverless-ai-prevents-overfitting-leakage/. More aggressive overfit protection can be enabled by setting lock_ga_to_final_trees=true or using recipe='more_overfit_protection' and fixed_only_first_fold_model='true' and for time-series experiments allow_stabilize_varimp_for_ts=true.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI avoid the multiple hypothesis (MH) problem?",
+    "output": "Driverless AI uses a variant of the reusable holdout technique to address the multiple hypothesis problem. Refer to https://pdfs.semanticscholar.org/25fe/96591144f4af3d8f8f79c95b37f415e5bb75.pdf for more information.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI suggest the experiment settings?",
+    "output": "When you run an experiment on a dataset, the experiment settings (Accuracy, Time, and Interpretability) are automatically suggested by Driverless AI. For example, Driverless AI may suggest the parameters Accuracy = 7, Time = 3, Interpretability = 6, based on your data.Driverless AI will automatically suggest experiment settings based on the number of columns and number of rows in your dataset. The settings are suggested to ensure best handling when the data is small. If the data is small, Driverless AI will suggest the settings that prevent overfitting and ensure the full dataset is utilized.If the number of rows and number of columns are each below a certain threshold, then:Accuracy will be increased up to 8.The accuracy is increased so that cross validation is done. (We don't want to \"throw away\" any data for internal validation purposes.)Interpretability will be increased up to 8.The higher the interpretability setting, the smaller the number of features in the final model.More complex features are not allowed.This prevents overfitting.Time will be decreased down to 2.There will be fewer feature engineering iterations to prevent overfitting.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What happens when I set Interpretability and Accuracy to the same number?",
+    "output": "The answer is currently that interpretability controls which features are created and what features are kept. (Also above interpretability = 6, monotonicity constraints are used in XGBoost GBM, XGBoost Dart, LightGBM, and Decision Tree models.) The accuracy refers to how hard Driverless AI then tries to make those features into the most accurate model",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can I specify the number of GPUs to use when running Driverless AI?",
+    "output": "When running an experiment, the Expert Settings let you specify the starting GPU ID for Driverless AI to use. You can also specify the maximum number of GPUs to use per model and per experiment. Refer to the Expert Settings section for more information.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I create the simplest model in Driverless AI?",
+    "output": "To create the simplest model in Driverless AI, set the following Experiment Settings: Set Accuracy to 1. Note that this can hurt performance as a sample will be used. If necessary, adjust the knob until the preview shows no sampling. Set Time to 1. Set Interpretability to 10. Next, configure the following Expert Settings: Turn OFF all algorithms except GLM. Set GLM models to ON. Set Ensemble level to 0. Set Select target transformation of the target for regression problems to Identity. Disable Data distribution shift detection. Disable Target Encoding. Alternatively, you can set Pipeline Building Recipe to Compliant. Compliant automatically configures the following experiment and expert settings: interpretability=10 (To avoid complexity. This overrides GUI or Python client settings for Interpretability.) enable_glm='on' (Remaining algos are 'off', to avoid complexity and be compatible with algorithms supported by MLI.) num_as_cat=true: Treat some numerical features as categorical. For instance, sometimes an integer column may not represent a numerical feature but represent different numerical codes instead. fixed_ensemble_level=0: Don't use any ensemble (to avoid complexity). feature_brain_level=0: No feature brain used (to ensure every restart is identical). max_feature_interaction_depth=1: Interaction depth is set to 1 (no multi-feature interactions to avoid complexity). target_transformer=\"identity\": For regression (to avoid complexity). check_distribution_shift=\"off\": Don't use distribution shift between train, valid, and test to drop features (bit risky without fine-tuning). For information on why your experiment isn't performing as expected, see the experiment performance section.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "When I run multiple experiments with different seeds, why do I see different scores, runtimes, and sizes on disk in the Experiments listing page?",
+    "output": "When running multiple experiments with all of the same settings except the seed, understand that a feature brain level > 0 can lead to variations in models, features, timing, and sizes on disk. (The default value is 2.) These variations can be disabled by setting the Feature Brain Level to 0 in the Expert Settings or in the config.toml file. In addition, if you use a different seed for each experiment, then each experiment can be different due to the randomness in the genetic algorithm that searches for the best features and model parameters. Only if Reproducible is set with the same seed and with a feature brain level of 0 should users expect the same outcome. Once a different seed is set, the models, features, timing, and sizes on disk can all vary within the constraints set by the choices made for the experiment. (I.e., accuracy, time, interpretability, expert settings, etc., all constrain the outcome, and then a different seed can change things within those constraints.)",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why does the final model performance appear to be worse than previous iterations?",
+    "output": "There are a few things to remember: Driverless AI creates a best effort estimate of the generalization performance of the best modeling pipeline found so far. The performance estimation is always based on holdout data (data unseen by the model). If no validation dataset is provided, the training data is split internally to create internal validation holdout data (once or multiple times or cross-validation, depending on the accuracy settings). If no validation dataset is provided, for accuracy <= 7, a single holdout split is used, and a \"lucky\" or \"unlucky\" split can bias estimates for small datasets or datasets with high variance. If a validation dataset is provided, then all performance estimates are solely based on the entire validation dataset (independent of accuracy settings). All scores reported are based on bootstrap-based statistical methods and come with error bars that represent a range of estimate uncertainty. After the final iteration, a best final model is trained on a final set of engineered features. Depending on accuracy settings, a more accurate estimation of generalization performance may be done using cross-validation. Also, the final model may be a stacked ensemble consisting of multiple base models, which generally leads to better performance. Consequently, in rare cases, the difference in performance estimation method can lead to the final model's estimated performance seeming poorer than those from previous iterations. (i.e., The final model's estimated score is significantly worse than the last iteration score and error bars don't overlap.) In that case, it is very likely that the final model performance estimation is more accurate, and the prior estimates were biased due to a \"lucky\" split. To confirm this, you can re-run the experiment multiple times (without setting the reproducible flag). If you would like to minimize the likelihood of the final model performance appearing worse than previous iterations, here are some recommendations: increase accuracy settings, provide a validation dataset, and provide more data.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I find features that may be causing data leakages in my Driverless AI model?",
+    "output": "To find original features that are causing leakage, have a look at features_orig.txt in the experiment summary download. Features causing leakage will have high importance there. To get a hint at derived features that might be causing leakage, create a new experiment with dials set to 2/2/8, and run the new experiment on your data with all your features and response. Then analyze the top 1-2 features in the model variable importance. They are likely the main contributors to data leakage if it is occurring.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I see the performance metrics on the test data?",
+    "output": "As long as you provide a target column in the test set, Driverless AI will show the best estimate of the final model's performance on the test set at the end of the experiment. The test set is never used to tune parameters (unlike what Kagglers often do), so this is purely a convenience. Of course, you can still make test set predictions and compute your own metrics using a method of your choice.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I see all the performance metrics possible for my experiment?",
+    "output": "At the end of the experiment, the model's estimated performance on all provided datasets with a target column is printed in the experiment logs. For example, for the test set:\n\n       Final scores on test (external holdout) +/- stddev:\n                      GINI = 0.87794 +/- 0.035305 (more is better)\n                       MCC = 0.71124 +/- 0.043232 (more is better)\n                       F05 = 0.79175 +/- 0.04209 (more is better)\n                        F1 = 0.75823 +/- 0.038675 (more is better)\n                        F2 = 0.82752 +/- 0.03604 (more is better)\n                  ACCURACY = 0.91513 +/- 0.011975 (more is better)\n                   LOGLOSS = 0.28429 +/- 0.016682 (less is better)\n                     AUCPR = 0.79074 +/- 0.046223 (more is better)\n        optimized: AUC = 0.93386 +/- 0.018856 (more is better)\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What if my training/validation and testing data sets come from different distributions?",
+    "output": "In general, Driverless AI uses training data to engineer features and train models and validation data to tune all parameters. If no external validation data is given, the training data is used to create internal holdouts. The way holdouts are created internally depends on whether there is a strong time dependence, see the point below. If the data has no obvious time dependency (e.g., if there is no time column, either implicit or explicit), or if the data can be sorted arbitrarily and it won't affect the outcome (e.g., Iris data, predicting flower species from measurements), and if the test dataset is different (e.g., new flowers or only large flowers), then the model performance on validation (either internal or external) as measured during training won't be achieved during final testing due to the obvious inability of the model to generalize.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Does Driverless AI handle weighted data?",
+    "output": "Yes. You can optionally provide an extra weight column in your training (and validation) data with non-negative observation weights. This can be useful to implement domain-specific effects such as exponential weighting in time or class weights. All of our algorithms and metrics in Driverless AI support observation weights, but note that estimated likelihoods can be skewed as a consequence.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI handle fold assignments for weighted data?",
+    "output": "Currently, Driverless AI does not take the weights into account during fold creation, but you can provide a fold column to enforce your own grouping, i.e., to keep rows that belong to the same group together (either in train or valid). The fold column has to be a categorical column (integers ok) that assigns a group ID to each row. (It needs to have at least 5 groups because we do up to 5-fold CV.)",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why do I see that adding new features to a dataset deteriorates the performance of the model?",
+    "output": "You may notice that after adding one or more new features to a dataset, it deteriorates the performance of the Driverless AI model. In Driverless AI, the feature engineering sequence is fairly random and may end up not doing the same things with original features if you restart entirely fresh with new columns. Beginning in Driverless AI v1.4.0, you now have the option to Restart from Last Checkpoint. This lets you pull in a new dataset with more columns, and Driverless AI will more iteratively take advantage of the new columns.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI handle imbalanced data for binary classification experiments?",
+    "output": "If you have data that is imbalanced, a binary imbalanced model can help to improve scoring with a variety of imbalanced sampling methods. An imbalanced model is able to take advantage of most (or even all) of the imbalanced dataset's positive values during sampling, while a regular model significantly limits the population of positive values. Imbalanced models, however, take more time to make predictions, and they are not always more accurate than regular models. We still recommend that you try using an imbalanced model if your data is imbalanced to see if scoring is improved over a regular model. Note that this information only applies to binary models.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How is feature importance calculated in Driverless AI?",
+    "output": "For most models, such as XGBoost or LightGBM models, Driverless AI uses normalized information gain to calculate feature importance. Other estimates of importance are sometimes used for certain models.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I want to have only one LightGBM model in the final pipeline. How can I do this?",
+    "output": "You can do this by using ensemble levels. To change the ensemble level, use the Ensemble Level for Final Modeling Pipeline expert setting (fixed_ensemble_level in the config.toml), which is located in the Model tab. If you want a single model, use level 0. If you are okay with using the same model with hyperparameters but trained with multiple cross validation folds, then use level 1. To use only one model type, use the Include Specific Models expert setting, which is located in the Recipes tab. For more information, see the section on ensemble learning in Driverless AI. Setting fixed_ensemble_level = 0 returns a single model trained on one hundred percent of the data, not just a single model type with CV. When the Cross-validate Single Final Model expert setting is enabled (default), the single model with fixed_ensemble_level = 0 has the optimal number of trees because it is tuned with CV. Disabling this setting is not recommended when fixed_ensemble_level = 0.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "I want to have only one LightGBM model and no FE. How can I do this?",
+    "output": "You can do this by additionally limiting the set of allowed transformations to just the OriginalTransformer, which leaves numeric features in their original form and drops all non-numeric features. To include or exclude specific transformers in your Driverless AI environment, use the Include Specific Transformers expert setting (included_transformers in the config.toml), which is located in the Recipes tab. You can also set the Feature Engineering Effort expert setting (feature_engineering_effort in the config.toml) to 0 to achieve the same effect. For more information, see the Transformations section.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What is fast approximation in Driverless AI?",
+    "output": "Fast approximation is available for both regular and Shapley predictions. It is enabled by default for MLI / AutoDoc and turned off by default for other clients. The extent of approximation can be fully configured or turned off with the fast approximation expert settings. Enabling fast approximation can result in a significant speedup for large prediction tasks like the creation of partial dependence plots and other MLI-related tasks. The following is a list of expert settings that can be used to configure fast approximation. Regular predictions: fast-approx-trees, fast-approx-one-fold, fast-approx-one-model. Shapley predictions: fast-approx-trees-shap, fast-approx-one-fold-shap, fast-approx-one-model-shap. MLI: mli_fast_approx.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "When should fast approximation be turned off?",
+    "output": "In situations where a more detailed partial dependence plot or interpretation is required, you may want to disable fast approximation.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why does the confusion matrix sometimes show decimals instead of whole numbers?",
+    "output": "Fractional confusion matrix values most commonly arise as a consequence of the averaging of confusion matrices across cross-validation fold splits or across repeated fold splits, but the same can also happen for non-integer observation weights.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Is data sampling for multiclass use cases supported?",
+    "output": "Data sampling for multiclass use cases is not currently supported. However, it is possible to approximate the data sampling approach by adding more weight in order to penalize rare classes. You can add weight to an individual observation by using a weight column when setting up your experiment. You can also enable LightGBM multiclass balancing by setting the enable_lightgbm_multiclass_balancing configuration setting to on, which enables automatic class weighting for imbalanced multiclass problems.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Where can I get details of the various transformations performed in an experiment?",
+    "output": "Download the experiment's log .zip file from the GUI. This zip file includes summary information, log information, and a gene_summary.txt file with details of the transformations used in the experiment. Specifically, there is a details folder with all subprocess logs. On the server, the experiment specific files are inside the /tmp/h2oai_experiment_<name>/ folder after the experiment completes, particularly h2oai_experiment_logs_<name>.zip and h2oai_experiment_summary_<name>.zip.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I download the predictions onto the machine where Driverless AI is running?",
+    "output": "When you select Score on Another Dataset, the predictions will automatically be stored on the machine where Driverless AI is running. They will be saved in the following locations (and can be opened again by Driverless AI, both for .csv and .bin):Training Data Predictions: tmp/h2oai_experiment_<name>/train_preds.csv (also saved as .bin)Testing Data Predictions: tmp/h2oai_experiment_<name>/test_preds.csv (also saved as .bin)New Data Predictions: tmp/h2oai_experiment_<name>/automatically_generated_name.csv. Note that the automatically generated name will match the name of the file downloaded to your local computer.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why are predicted probabilities not available when I run an experiment without ensembling?",
+    "output": "When Driverless AI provides pre-computed predictions after completing an experiment, it uses only those parts of the modeling pipeline that were not trained on the particular rows for which the predictions are made. This means that Driverless AI needs holdout data in order to create predictions, such as validation or test sets, where the model is trained on training data only. In the case of ensembles, Driverless AI uses cross-validation to generate holdout folds on the training data, so we are able to provide out-of-fold estimates for every row in the training data and, hence, can also provide training holdout predictions (that will provide a good estimate of generalization performance). In the case of a single model, though, that is trained on 100% of the training data. There is no way to create unbiased estimates for any row in the training data. While DAI uses an internal validation dataset, this is a re-usable holdout, and therefore will not contain holdout predictions for the full training dataset. You need cross-validation in order to get out-of-fold estimates, and then that's not a single model anymore. If you want to still get predictions for the training data for a single model, then you have to use the scoring API to create predictions on the training set. From the GUI, this can be done using the Score on Another Dataset button for a completed experiment. Note, though, that the results will likely be overly optimistic, too good to be true, and virtually useless.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What drives the size of a MOJO?",
+    "output": "The size of the MOJO is based on the complexity of the final modeling pipeline (i.e., feature engineering and models). One of the biggest factors is the amount of higher-order interactions between features, especially target encoding and related features, which have to store lookup tables for all possible combinations observed in the training data. You can reduce the amount of these transformations by reducing the value of Max. feature interaction depth and/or Feature engineering effort under Expert Settings, or by increasing the interpretability settings for the experiment. Ensembles also contribute to the final modeling pipeline's complexity as each model has its own pipeline. You can reduce this by lowering the accuracy settings or setting the ensemble level to a lower number. The number of features (Max. pipeline features) also affects the MOJO size. Text transformers are pretty bulky as well and can add to the MOJO size. To toggle to a smaller mojo during model building with a single click, see Reduce mojo size under experiment settings of an experiment.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Are MOJOs thread safe?",
+    "output": "Yes, all Driverless AI MOJOs are thread safe.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Running the scoring pipeline for my MOJO is taking several hours. How can I get this to run faster?",
+    "output": "When running example.sh, Driverless AI implements a memory setting, which is suitable for most use cases. For very large models, however, it may be necessary to increase the memory limit when running the Java application for data transformation. This can be done using the -Xmx25g parameter. For example:\n\n  java -Xmx25g -Dai.h2o.mojos.runtime.license.file=license.sig -cp mojo2-runtime.jar ai.h2o.mojos.ExecuteMojo pipeline.mojo example.csv\n",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why have I encountered a \"Best Score is not finite\" error?",
+    "output": "Driverless AI uses 32-bit floats by default. You may encounter this error if your data value exceeds 1E38 or if you are resolving more than 1 part in 10 million. You can resolve this error using one of the following methods: Enable the Force 64-bit Precision option in the experiment's Expert Settings, or set data_precision=\"float64\" and transformer_precision=\"float64\" in config.toml.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What if my data has a time dependency?",
+    "output": "If you know that your data has a strong time dependency, select a time column before starting the experiment. The time column must be in a Datetime format that can be parsed by pandas, such as \"2017-11-06 14:32:21\", \"Monday, June 18, 2012\" or \"Jun 18 2018 14:34:00\" etc., or contain only integers.If you are unsure about the strength of the time dependency, run two experiments: One with time column set to \"[OFF]\" and one with time column set to \"[AUTO]\" (or pick a time column yourself).",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What is a lag, and why does it help?",
+    "output": "A lag is a feature value from a previous point in time. Lags are useful to take advantage of the fact that the current (unknown) target value is often correlated with previous (known) target values. Hence, they can better capture target patterns along the time axis. Why can't I specify a validation data set for time-series problems? Why do you look at the test set for time-series problems? The problem with validation vs test in the time series setting is that there is only one valid way to define the split. If a test set is given, its length in time defines the validation split and the validation data has to be part of train. Otherwise the time-series validation won't be useful. For instance: Let's assume we have train = [1,2,3,4,5,6,7,8,9,10] and test = [12,13], where integers define time periods (e.g., weeks). For this example, the most natural train/valid split that mimics the test scenario would be: train = [1,2,3,4,5,6,7] and valid = [9,10], and period 8 is not included in the training set to allow for a gap. Note that we will look at the start time and the duration of the test set only (if provided), and not at the contents of the test data (neither features nor target). If the user provides validation = [8,9,10] instead of test data, then this could lead to inferior validation strategy and worse generalization. Hence, we use the user-given test set only to create the optimal internal train/validation splits. If no test set is provided, the user can provide the length of the test set (in periods), the length of the train/test gap (in periods) and the length of the period itself (in seconds).",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why does the gap between train and test matter? Is it because of creating the lag features on the test set?",
+    "output": "Taking the gap into account is necessary in order to avoid too optimistic estimates of the true error and to avoid creating history-based features like lags for the training and validation data (which cannot be created for the test data due to the missing information).",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "In regards to applying the target lags to different subsets of the time group columns, are you saying Driverless AI performs auto-correlation at \"levels\" of the time series? For example, consider the Walmart dataset where I have Store and Dept (and my target is Weekly Sales). Are you saying that Driverless AI checks for auto-correlation in Weekly Sales based on just Store, just Dept, and both Store and Dept?",
+    "output": "Currently, auto-correlation is only applied on the detected superkey (entire TGC) of the training dataset relation at the very beginning. It's used to rank potential lag-sizes, with the goal to prune the search space for the GA optimization process, which is responsible for selecting the lag features.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does Driverless AI detect the time period?",
+    "output": "Driverless AI treats each time series as a function with some frequency 1/ns. The actual value is estimated by the median of time deltas across maximal length TGC subgroups. The chosen SI unit minimizes the distance to all available SI units.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What is the logic behind the selectable numbers for forecast horizon length?",
+    "output": "The shown forecast horizon options are based on quantiles of valid splits. This is necessary because Driverless AI cannot display all possible options in general.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Assume that in my Walmart dataset, all stores provided data at the week level, but one store provided data at the day level. What would Driverless AI do?",
+    "output": "Driverless AI would still assume \"weekly data\" in this case because the majority of stores are yielding this property. The \"daily\" store would be resampled to the detected overall frequency.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Assume that in my Walmart dataset, all stores and departments provided data at the weekly level, but one department in a specific store provided weekly sales on a bi-weekly basis (every two weeks). What would Driverless AI do?",
+    "output": "That's similar to having missing data. Due to proper resampling, Driverless AI can handle this without any issues.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Why does the number of weeks that you want to start predicting matter?",
+    "output": "That's an option to provide a train-test gap if no test data is available. That is to say, \"I don't have my test data yet, but I know it will have a gap to train of x.\"",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Are the scoring components of time series sensitive to the order in which new pieces of data arrive? I.e., is each row independent at scoring time, or is there a real-time windowing effect in the scoring pieces?",
+    "output": "Each row is independent at scoring time.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What happens if the user, at predict time, gives a row with a time value that is too small or too large?",
+    "output": "Internally, \"out-of-bounds\" time values are encoded with special values. The samples will still be scored, but the predictions won't be trustworthy.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What's the minimum data size for a time series recipe?",
+    "output": "We recommend that you have around 10,000 validation samples in order to get a reliable estimate of the true error. The time series recipe can still be applied for smaller data, but the validation error might be inaccurate.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How long must the training data be compared to the test data?",
+    "output": "At a minimum, the training data has to be at least twice as long as the test data along the time axis. However, we recommend that the training data be at least three times as long as the test data.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How does the time series recipe deal with missing values?",
+    "output": "Missing values will be converted to a special value, which is different from any non-missing feature value. Explicit imputation techniques won't be applied.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "Can the time information be distributed across multiple columns in the input data (such as [year, day, month])?",
+    "output": "Currently Driverless AI requires the data to have the time stamps given in a single column. Driverless AI will create additional time features like [year, day, month] on its own, if they turn out to be useful.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What type of modeling approach does Driverless AI use for time series?",
+    "output": "Driverless AI combines the creation of history-based features like lags, moving averages etc. with the modeling techniques, which are also applied for i.i.d. data. The primary model of choice is XGBoost.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "What's the idea behind exponential weighting of moving averages?",
+    "output": "Exponential weighting accounts for the possibility that more recent observations are better suited to explain the present than older observations.",
+    "prompt_type": "human_bot"
+  },
+  {
+    "instruction": "How can I reduce the size of the Audit Logger?",
+    "output": "An Audit Logger file is created every day that Driverless AI is in use. The audit_log_retention_period config variable lets you specify the number of days, after which the audit.log will be overwritten. This option defaults to 5 days, which means that Driverless AI will maintain Audit Logger files for the last 5 days, and audit.log files older than 5 days are removed and replaced with newer log files. When this option is set to 0, the audit.log file will not be overwritten.",
+    "prompt_type": "human_bot"
+  }
+]

data/h2ogpt-personality.json ADDED Viewed

	@@ -0,0 +1,642 @@

+[
+  {
+    "input": "<human>: What's your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What's your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What is your name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: What are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who are you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Do you have a name? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who trained you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who created you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: My name is h2oGPT. I'm a large language model trained by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. ",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How can I help you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. How may I assist you?",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  },
+  {
+    "input": "<human>: Who made you? <bot>: Hi! I'm h2oGPT, a large language model by H2O.ai. Nice to meet you.",
+    "prompt_type": "plain",
+    "source": "H2O.ai"
+  }
+]

data/merged.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/pexels-evg-kowalievska-1170986_small.jpg ADDED Viewed

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+# Compose definition for the h2o-llm service: builds the image from the local
+# Dockerfile and starts it with the generate.py command below.
+version: '3'
+services:
+  h2o-llm:
+    build:
+      context: ./
+      dockerfile: Dockerfile
+      args:
+        BUILDKIT_INLINE_CACHE: "0"
+    image: h2o-llm
+    shm_size: '64gb'
+    command: generate.py --load_8bit True --base_model 'EleutherAI/gpt-j-6B'
+    restart: unless-stopped
+    volumes:
+      - h2o-llm:/root/.cache # Location downloaded weights will be stored
+    ports:
+      # HOST:CONTAINER is quoted so a YAML parser can never read the
+      # colon-separated digits as a number instead of a string.
+      - '7860:7860'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU exposed by the host for this service.
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+# Named volume backing the /root/.cache mount declared on the service.
+volumes:
+  h2o-llm:
+    name: h2o-llm