Spaces:
Runtime error
Runtime error
import streamlit as st | |
from transformers import AutoTokenizer | |
import json | |
import tempfile | |
import os | |
import uuid | |
import copy | |
import shutil | |
st.set_page_config(layout="wide") | |
def sanitize_jinja2(jinja_lines): | |
one_liner_jinja = "" | |
for line in jinja_lines: | |
one_liner_jinja += line.lstrip(" ").rstrip("\n") | |
return one_liner_jinja | |
def get_existing_templates(): | |
return [None] + os.listdir("./templates") | |
# if os.path.exists("./tmp"): | |
# if len(os.listdir("./tmp")) > 20: | |
# shutil.rmtree('./tmp') | |
# Initialization | |
if 'tokenizer_json' not in st.session_state: | |
st.session_state['tokenizer_json'] = None | |
if 'tokenizer' not in st.session_state: | |
st.session_state['tokenizer'] = None | |
if 'repo_normalized_name' not in st.session_state: | |
st.session_state['repo_normalized_name'] = None | |
if 'repo_id' not in st.session_state: | |
st.session_state['repo_id'] = None | |
if 'input_jinja_template' not in st.session_state: | |
st.session_state['input_jinja_template'] = "" | |
if 'uuid' not in st.session_state: | |
st.session_state['uuid'] = uuid.uuid4() | |
os.makedirs(f"./tmp/{st.session_state['uuid']}") | |
if 'successful_template' not in st.session_state: | |
st.session_state['successful_template'] = '' | |
if 'generated_prompt_w_add_generation_prompt' not in st.session_state: | |
st.session_state['generated_prompt_w_add_generation_prompt'] = '' | |
if 'generated_prompt_wo_add_generation_prompt' not in st.session_state: | |
st.session_state['generated_prompt_wo_add_generation_prompt'] = '' | |
if not os.path.exists("./tmp"): | |
os.makedirs("./tmp") | |
title_description = """ | |
Chat Template Generation: Make Chat Easier with Huggingface Tokenizer | |
""" | |
st.title(title_description) | |
st.markdown('This streamlit app is to serve as an easier way to check and push the chat template to your/exisiting huggingface repo') | |
list_of_templates = get_existing_templates() | |
with st.expander("Current predefined templates"): | |
for model in list_of_templates[1:]: | |
st.markdown(f"- {model}") | |
st.info('More templates will be predefined for easier setup of chat template.', icon="ℹ️") | |
st.divider() | |
# custom_repo_tab, prebuilt_template_tab = st.tabs(["Specify Custom Repository Path", "Select Prebuilt Template"]) | |
hf_model_repo_name = st.text_input("Hugging Face Model Repository To Update", value="tiiuae/falcon-7b", max_chars=None, key=None, type="default", | |
help=None, autocomplete=None, label_visibility="visible") | |
gen_button = st.button("Get Tokenizer Config") | |
if gen_button: | |
with st.spinner(text="In progress...", cache=False): | |
st.session_state['repo_id'] = hf_model_repo_name | |
st.session_state['tokenizer'] = AutoTokenizer.from_pretrained(hf_model_repo_name) | |
st.session_state['repo_normalized_name'] = hf_model_repo_name.replace("/", "_") | |
st.session_state['tokenizer_json'] = f"./tmp/{st.session_state['uuid']}_{hf_model_repo_name}" | |
# st.session_state['tokenizer'].save_pretrained(st.session_state['tokenizer_json']) | |
if st.session_state['tokenizer_json'] is not None: | |
st.session_state['tokenizer'].save_pretrained(st.session_state['tokenizer_json']) | |
with open(f"{st.session_state['tokenizer_json']}/tokenizer_config.json", "rb") as f: | |
tokenizer_json = json.load(f) | |
shutil.rmtree(st.session_state['tokenizer_json']) | |
json_spec, col2 = st.columns(spec=[0.3, 0.7]) | |
with json_spec: | |
st.markdown(f"### Tokenizer Config from {st.session_state['repo_normalized_name']}") | |
st.json(json.dumps(tokenizer_json, indent=4)) | |
with col2: | |
chat = [ | |
{"role": "system", "content": "You are a helpful assistant."}, | |
{"role": "user", "content": "Hello, how are you?"}, | |
{"role": "assistant", "content": "I'm doing great. How can I help you today?"}, | |
{"role": "user", "content": "I'd like to show off how chat templating works!"}, | |
] | |
st.markdown("### Example Conversation") | |
st.json(json.dumps(chat, indent=4), expanded=False) | |
prompt_template_col, prompt_template_output_col = st.columns(spec=[0.3, 0.7]) | |
with prompt_template_col: | |
list_of_templates = get_existing_templates() | |
selected_template = st.selectbox("Choose Existing Template or Leave Blank. (If template is None, it will check current tokenizer's `chat_template` and `default_chat_template` fields)", | |
options=list_of_templates, | |
index=0, placeholder="Choose a template (If template is None, it will check current tokenizer `chat_template` and `default_chat_template` fields)", disabled=False, label_visibility="visible") | |
# add_generation_prompt_checkbox = st.checkbox("add_generation_prompt") | |
generate_prompt_example_button = st.button("Generate Prompt", key="generate_prompt_example_button") | |
# if selected_template is None: | |
# st.session_state['input_jinja_template'] = st.text_area( | |
# "Jinja Chat Template", value=st.session_state['input_jinja_template'], | |
# height=500, placeholder=None, disabled=False, label_visibility="visible") | |
if selected_template is not None: | |
with open(f"./templates/{selected_template}", "r") as f: | |
jinja_lines = f.readlines() | |
st.session_state['input_jinja_template'] = "".join(jinja_lines) | |
if selected_template is None: | |
st.session_state['input_jinja_template'] = st.session_state['tokenizer'].chat_template | |
if st.session_state['input_jinja_template'] is None: | |
st.session_state['input_jinja_template'] = st.session_state['tokenizer'].default_chat_template | |
st.session_state['input_jinja_template'] = st.text_area( | |
"Jinja Chat Template", value=st.session_state['input_jinja_template'], | |
height=500, placeholder=None, disabled=False, label_visibility="visible") | |
with prompt_template_output_col: | |
# print(st.session_state['input_jinja_template']) | |
if generate_prompt_example_button: | |
with open(f"./tmp/{st.session_state['uuid']}/tmp_chat_template.json", "w") as fp: | |
fp.write(st.session_state['input_jinja_template']) | |
with open(f"./tmp/{st.session_state['uuid']}/tmp_chat_template.json", "r") as f: | |
jinja_lines = copy.deepcopy(f.readlines()) | |
st.session_state['tokenizer'].chat_template = sanitize_jinja2(jinja_lines) | |
# print(sanitize_jinja2(jinja_lines)) | |
os.remove(f"./tmp/{st.session_state['uuid']}/tmp_chat_template.json") | |
st.session_state['generated_prompt_wo_add_generation_prompt'] = st.session_state['tokenizer'].apply_chat_template(chat, tokenize=False, add_generation_prompt= False) | |
st.session_state['generated_prompt_w_add_generation_prompt'] = st.session_state['tokenizer'].apply_chat_template(chat, tokenize=False, add_generation_prompt= True) | |
# print(generated_prompt_wo_add_generation_prompt) | |
st.session_state['successful_template'] = copy.deepcopy(st.session_state['input_jinja_template']) | |
# print(st.session_state['successful_template']) | |
if len(st.session_state['successful_template']) > 0: | |
st.text_area( | |
"Generate Prompt with `add_generation_prompt=False`", value=st.session_state['generated_prompt_wo_add_generation_prompt'], | |
height=300, placeholder=None, disabled=True, label_visibility="visible", key="generated_prompt_wo_add_generation_prompt_text_area") | |
st.text_area( | |
"Generate Prompt with `add_generation_prompt=True`", value=st.session_state['generated_prompt_w_add_generation_prompt'], | |
height=300, placeholder=None, disabled=True, label_visibility="visible", key="generated_prompt_w_add_generation_prompt_text_area") | |
access_token_no_cache = st.text_input("HuggingFace Access Token API with Write Access", type="password", key="access_token_no_cache") | |
commit_message_text_input = st.text_input("Commit Message", key="commit_message_text_input") | |
to_private_checkbox = st.checkbox("To Private Repo", key="to_private_checkbox") | |
create_pr_checkbox = st.checkbox("Create PR (For Contribution 🤗)", key="create_pr_checkbox") | |
push_to_hub_button = st.button("Push to Hub", key="push_to_hub_button", use_container_width=True) | |
st.session_state['tokenizer'].save_pretrained(st.session_state['tokenizer_json']) | |
with open(f"{st.session_state['tokenizer_json']}/tokenizer_config.json", "r") as f: | |
tokenizer_config_content = json.loads(f.read()) | |
shutil.rmtree(st.session_state['tokenizer_json']) | |
st.download_button( | |
label="Download tokenizer_config.json", | |
data=json.dumps(tokenizer_config_content, indent=4), | |
file_name='tokenizer_config.json', | |
mime='application/json', | |
use_container_width=True | |
) | |
if push_to_hub_button: | |
with open(f"./tmp/{st.session_state['uuid']}/tmp_chat_template.json", "w") as fp: | |
fp.write(st.session_state['successful_template']) | |
with open(f"./tmp/{st.session_state['uuid']}/tmp_chat_template.json", "r") as f: | |
successful_jinja_lines = f.readlines() | |
st.session_state['tokenizer'].chat_template = sanitize_jinja2(successful_jinja_lines) | |
try: | |
with st.spinner(text="Pushing to hub ...", cache=False): | |
st.session_state['tokenizer'].push_to_hub( | |
repo_id=st.session_state['repo_id'], | |
commit_message=commit_message_text_input, | |
private=to_private_checkbox, | |
token=access_token_no_cache, | |
create_pr=create_pr_checkbox) | |
except Exception as e: | |
st.write(f"Repo id: {st.session_state['repo_id']}") | |
st.write(str(e)) | |
os.remove(f"./tmp/{st.session_state['uuid']}/tmp_chat_template.json") | |