|
import streamlit as st |
|
from transformers import AutoTokenizer, PreTrainedTokenizerFast |
|
from streamlit_ace import st_ace |
|
|
|
|
|
@st.cache(allow_output_mutation=True)
def get_tokenizer(name):
    """Load and cache a tokenizer for *name*.

    The sentinel name ``"gpt-neox"`` loads a fast tokenizer from the local
    ``neox_tokenizer.json`` file; any other name is resolved through the
    Hugging Face hub via ``AutoTokenizer.from_pretrained``.

    NOTE(review): ``st.cache(allow_output_mutation=True)`` is deprecated in
    recent Streamlit releases in favor of ``st.cache_resource`` — confirm the
    pinned Streamlit version before migrating.
    """
    if name == "gpt-neox":
        return PreTrainedTokenizerFast(tokenizer_file="neox_tokenizer.json")
    return AutoTokenizer.from_pretrained(name)
|
|
|
|
|
def examine(tokenizer, tok_name, inp):
    """Render a Streamlit panel showing how *tokenizer* encodes *inp*.

    Displays the token count, whether the round-trip decode (with special
    tokens stripped) exactly reproduces the input, the raw decoded text,
    and the encoded token ids.
    """
    token_ids = tokenizer.encode(inp)
    # Raw decode keeps special tokens for inspection; the cleaned decode
    # (special tokens stripped) is what the exact-match check uses.
    decoded_raw = tokenizer.decode(token_ids)
    decoded_clean = tokenizer.decode(token_ids, skip_special_tokens=True)

    st.header(f"{tok_name}")
    st.subheader(f"#tokens = {len(token_ids)}")
    st.text(f"Exact match = {decoded_clean == inp}")
    st.code(decoded_raw)
    st.write(token_ids)
|
|
|
|
|
# --- UI: choose two tokenizers, enter text, and compare them side by side ---
tok_names = [
    st.text_input("Tokenizer 1 to examine", "gpt-neox"),
    st.text_input("Tokenizer 2 to examine", "t5-small"),
]
# Load both tokenizers before rendering the editor, matching the original
# top-to-bottom execution order of the script.
tokenizers = [get_tokenizer(n) for n in tok_names]
inp = st_ace("Find {eq}\\dfrac{\\partial f}{\\partial x} \\text{ and } \\dfrac{\\partial f}{\\partial y}{/eq} for {eq}f(x,y) = 13(8x - 7y+3)^5{/eq}.")
for col, tok, name in zip(st.columns(2), tokenizers, tok_names):
    with col:
        examine(tok, name, inp)
|
|