Merge pull request #1 from lewtun/add-langchain
- .env.example +3 -0
- .gitignore +163 -0
- README.md +12 -1
- app.py +71 -19
- config.py.example +1 -1
- prompt_templates/openai_chatgpt.json +9 -0
- requirements.txt +1 -2
.env.example
ADDED
@@ -0,0 +1,3 @@
+DATASET_REPO_URL="https://huggingface.co/datasets/{DATASET_ID}"
+FORCE_PUSH="no"
+HF_TOKEN="hf_xxx"
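These three variables are consumed at startup by `app.py` (see its diff below) via `python-dotenv`. A minimal sketch of that pattern, using only the names defined in this file:

```python
import os
from pathlib import Path

from dotenv import load_dotenv

# Mirror app.py: load .env into the environment if it exists, then read keys.
if Path(".env").is_file():
    load_dotenv(".env")

DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
FORCE_PUSH = os.getenv("FORCE_PUSH")
HF_TOKEN = os.getenv("HF_TOKEN")
```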
.gitignore
ADDED
@@ -0,0 +1,163 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Local development
+data/
README.md
CHANGED
@@ -14,6 +14,7 @@ A basic example of an RLHF interface with a Gradio app.
 **Instructions for someone to use for their own project:**
 
 *Setting up the Space*
+
 1. Clone this repo and deploy it on your own Hugging Face space.
 2. Add the following secrets to your space:
    - `HF_TOKEN`: One of your Hugging Face tokens.
@@ -24,11 +25,21 @@ A basic example of an RLHF interface with a Gradio app.
 huggingface.co, the app will use your token to automatically store new HITs
 in your dataset. Setting `FORCE_PUSH` to "yes" ensures that your repo will
 force push changes to the dataset during data collection. Otherwise,
-accidental manual changes to your dataset could result in your space
+accidental manual changes to your dataset could result in your space getting
 merge conflicts as it automatically tries to push the dataset to the hub. For
 local development, add these three keys to a `.env` file, and consider setting
 `FORCE_PUSH` to "no".
+
+To launch the Space locally, run:
+
+```bash
+python app.py
+```
+
+The app will then be available at a local address, such as http://127.0.0.1:7860
+
 *Running Data Collection*
+
 1. On your local repo that you pulled, create a copy of `config.py.example`,
 just called `config.py`. Now, put keys from your AWS account in `config.py`.
 These keys should be for an AWS account that has the
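The `FORCE_PUSH` behavior described above is implemented by the `force_git_push` helper in `utils.py`, which `app.py` imports but whose body is not part of this diff. A hedged sketch of the overall push pattern, with the `Repository` arguments and the helper's signature assumed:

```python
import os

from dotenv import load_dotenv
from huggingface_hub import Repository

load_dotenv(".env")

# Clone the dataset repo locally; collected HITs are committed into it.
repo = Repository(
    local_dir="data",
    clone_from=os.getenv("DATASET_REPO_URL"),
    use_auth_token=os.getenv("HF_TOKEN"),
)

if os.getenv("FORCE_PUSH") == "yes":
    # Overwrite the remote so manual edits to the dataset on the Hub
    # cannot leave the Space stuck on a merge conflict (assumed signature).
    from utils import force_git_push
    force_git_push(repo)
else:
    repo.push_to_hub()
```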
app.py
CHANGED
@@ -1,18 +1,21 @@
 # Basic example for doing model-in-the-loop dynamic adversarial data collection
 # using Gradio Blocks.
+import json
 import os
-import json
+import threading
 import uuid
+from pathlib import Path
 from urllib.parse import parse_qs
+
 import gradio as gr
-import requests
-from transformers import pipeline, Conversation
-from huggingface_hub import Repository
 from dotenv import load_dotenv
-from pathlib import Path
-import
+from huggingface_hub import Repository
+from langchain import ConversationChain
+from langchain.chains.conversation.memory import ConversationBufferMemory
+from langchain.llms import HuggingFaceHub
+from langchain.prompts import load_prompt
+
 from utils import force_git_push
-import threading
 
 # These variables are for storing the mturk HITs in a Hugging Face dataset.
 if Path(".env").is_file():
@@ -20,6 +23,10 @@ if Path(".env").is_file():
 DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
 FORCE_PUSH = os.getenv("FORCE_PUSH")
 HF_TOKEN = os.getenv("HF_TOKEN")
+PROMPT_TEMPLATES = Path("prompt_templates")
+# Set env variable for langchain to communicate with Hugging Face Hub
+os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
+
 DATA_FILENAME = "data.jsonl"
 DATA_FILE = os.path.join("data", DATA_FILENAME)
 repo = Repository(
@@ -49,7 +56,47 @@ f_stop = threading.Event()
 asynchronous_push(f_stop)
 
 # Now let's run the app!
-
+prompt = load_prompt(PROMPT_TEMPLATES / "openai_chatgpt.json")
+
+chatbot_1 = ConversationChain(
+    llm=HuggingFaceHub(
+        repo_id="google/flan-t5-xl",
+        model_kwargs={"temperature": 1}
+    ),
+    prompt=prompt,
+    verbose=False,
+    memory=ConversationBufferMemory(ai_prefix="Assistant"),
+)
+
+chatbot_2 = ConversationChain(
+    llm=HuggingFaceHub(
+        repo_id="bigscience/bloom",
+        model_kwargs={"temperature": 0.7}
+    ),
+    prompt=prompt,
+    verbose=False,
+    memory=ConversationBufferMemory(ai_prefix="Assistant"),
+)
+
+chatbot_3 = ConversationChain(
+    llm=HuggingFaceHub(
+        repo_id="bigscience/T0_3B",
+        model_kwargs={"temperature": 1}
+    ),
+    prompt=prompt,
+    verbose=False,
+    memory=ConversationBufferMemory(ai_prefix="Assistant"),
+)
+
+chatbot_4 = ConversationChain(
+    llm=HuggingFaceHub(
+        repo_id="EleutherAI/gpt-j-6B",
+        model_kwargs={"temperature": 1}
+    ),
+    prompt=prompt,
+    verbose=False,
+    memory=ConversationBufferMemory(ai_prefix="Assistant"),
+)
 
 demo = gr.Blocks()
 
@@ -65,6 +112,8 @@ with demo:
         "generated_responses": [],
         "response_1": "",
         "response_2": "",
+        "response_3": "",
+        "response_4": "",
     }
     state = gr.JSON(state_dict, visible=False)
 
@@ -74,31 +123,34 @@ with demo:
     state_display = gr.Markdown(f"Your messages: 0/{TOTAL_CNT}")
 
     # Generate model prediction
-    # Default model: distilbert-base-uncased-finetuned-sst-2-english
     def _predict(txt, state):
-
-
-
-
-
-
-
-
+        # TODO: parallelize this!
+        response_1 = chatbot_1.predict(input=txt)
+        response_2 = chatbot_2.predict(input=txt)
+        response_3 = chatbot_3.predict(input=txt)
+        response_4 = chatbot_4.predict(input=txt)
+
+        response2model = {}
+        response2model[response_1] = chatbot_1.llm.repo_id
+        response2model[response_2] = chatbot_2.llm.repo_id
+        response2model[response_3] = chatbot_3.llm.repo_id
+        response2model[response_4] = chatbot_4.llm.repo_id
 
         state["cnt"] += 1
 
         new_state_md = f"Inputs remaining in HIT: {state['cnt']}/{TOTAL_CNT}"
 
-        state["data"].append({"cnt": state["cnt"], "text": txt, "response_1": response_1, "response_2": response_2})
+        state["data"].append({"cnt": state["cnt"], "text": txt, "response_1": response_1, "response_2": response_2, "response_3": response_3, "response_4": response_4, "response2model": response2model})
         state["past_user_inputs"].append(txt)
 
         past_conversation_string = "<br />".join(["<br />".join(["😃: " + user_input, "🤖: " + model_response]) for user_input, model_response in zip(state["past_user_inputs"], state["generated_responses"] + [""])])
-        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True, choices=[response_1, response_2], interactive=True, value=response_1), gr.update(value=past_conversation_string), state, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), new_state_md, dummy
+        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True, choices=[response_1, response_2, response_3, response_4], interactive=True, value=response_1), gr.update(value=past_conversation_string), state, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), new_state_md, dummy
 
     def _select_response(selected_response, state, dummy):
         done = state["cnt"] == TOTAL_CNT
        state["generated_responses"].append(selected_response)
         state["data"][-1]["selected_response"] = selected_response
+        state["data"][-1]["selected_model"] = state["data"][-1]["response2model"][selected_response]
         if state["cnt"] == TOTAL_CNT:
             # Write the HIT data to our local dataset because the worker has
             # submitted everything now.
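Each of the four `ConversationChain`s above can be exercised on its own, which is handy for smoke-testing a model before wiring it into the Gradio app. A minimal sketch, assuming `HUGGINGFACEHUB_API_TOKEN` is set and `langchain==0.0.74` from `requirements.txt` is installed:

```python
from langchain import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.llms import HuggingFaceHub
from langchain.prompts import load_prompt

# Build one chain exactly as app.py does and query it twice;
# ConversationBufferMemory carries the first exchange into the second prompt.
prompt = load_prompt("prompt_templates/openai_chatgpt.json")
chatbot = ConversationChain(
    llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 1}),
    prompt=prompt,
    verbose=False,
    memory=ConversationBufferMemory(ai_prefix="Assistant"),
)
print(chatbot.predict(input="What is the capital of France?"))
print(chatbot.predict(input="And of Germany?"))
```

Note that `response2model` is keyed by the response text itself, so two chains returning an identical string would collide in the mapping; the `TODO` in `_predict` likewise flags the four sequential `predict` calls as a parallelization opportunity.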
config.py.example
CHANGED
@@ -3,4 +3,4 @@
 # and Access Management (IAM) panel.
 
 MTURK_KEY = ''
-MTURK_SECRET = '
+MTURK_SECRET = ''
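These MTurk credentials are consumed by `boto3` (pinned in `requirements.txt`) during data collection. A hypothetical sketch of building a requester client from them; the region and sandbox endpoint are illustrative assumptions:

```python
import boto3

from config import MTURK_KEY, MTURK_SECRET

# Build an MTurk client from the keys in config.py (sandbox endpoint shown).
mturk = boto3.client(
    "mturk",
    aws_access_key_id=MTURK_KEY,
    aws_secret_access_key=MTURK_SECRET,
    region_name="us-east-1",
    endpoint_url="https://mturk-requester-sandbox.us-east-1.amazonaws.com",
)
print(mturk.get_account_balance()["AvailableBalance"])
```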
prompt_templates/openai_chatgpt.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "input_variables": [
+        "history",
+        "input"
+    ],
+    "output_parser": null,
+    "template": "Assistant is a large language model trained by OpenAI.\n\nAssistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.\n\nAssistant is constantly learning and improving, and its capabilities are constantly evolving. It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics.\n\nOverall, Assistant is a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.\n\n{history}\nHuman: {input}\nAssistant:",
+    "template_format": "f-string"
+}
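This file is a serialized LangChain `PromptTemplate`; `app.py` deserializes it with `load_prompt`. Once loaded, the f-string template can be filled directly, which is an easy way to inspect exactly what the models see (the history/input values below are illustrative):

```python
from langchain.prompts import load_prompt

prompt = load_prompt("prompt_templates/openai_chatgpt.json")

# Interpolate the two declared input variables into the f-string template.
text = prompt.format(
    history="Human: Hi there!\nAssistant: Hello! How can I help?",
    input="What can you do?",
)
print(text)
```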
requirements.txt
CHANGED
@@ -1,5 +1,4 @@
-torch==1.12.0
-transformers==4.20.1
 boto3==1.24.32
 huggingface_hub==0.8.1
 python-dotenv==0.20.0
+langchain==0.0.74