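"""Streamlit app that visualizes how TAPAS tokenizes a table.

To try it locally (assuming this file is saved as app.py):

    streamlit run app.py
"""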
import io

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer

from tapas_visualizer import TapasVisualizer

st.set_page_config(page_title="Tapas Tokenizer", page_icon="🍽️", layout="wide")
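

# Callbacks that record which input source (text area or file upload) was
# edited last, so tokenization reads from the most recently used one.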
def set_file_input():
    st.session_state.input_stream = "file"


def set_text_input():
    st.session_state.input_stream = "text"
def main():
    models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]

    @st.cache_resource()
    def load_tokenizer(model_name: str):
        # Keyed on the model name so switching models loads (and caches) a new tokenizer.
        return AutoTokenizer.from_pretrained(model_name)
    st.markdown(
        """
## TAPAS Tokenization Visualization

[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas) models work on tables.
The tool below helps visualize how a table is tokenized and shows the total (and row-wise)
token counts.

Implementation adapted from `tokenizers.tools.EncodingVisualizer`.
"""
    )
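
    # Two-column layout: table input on the left, token visualization on the right.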
    col1, col2 = st.columns([1, 2])

    with col1:
        selected_model = st.selectbox("Select a tokenizer", models, key=1)

        text = st.text_area(
            label="", placeholder="Table to tokenize; csv", on_change=set_text_input
        )

        uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)

        button_clicked = st.button("Tokenize")
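
    # Load the selected tokenizer (cached across reruns) and wrap it in the visualizer.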
    tokenizer = load_tokenizer(selected_model)
    visualizer = TapasVisualizer(tokenizer)
    with col2:
        if text or uploaded_file or button_clicked:
            df: pd.DataFrame | None = None
            if (
                "input_stream" not in st.session_state
                or st.session_state.input_stream == "text"
            ):
                # Default to the text area (also when no input source has been recorded yet).
                df = pd.read_csv(io.StringIO(text), sep=",")
            elif st.session_state.input_stream == "file":
                df = pd.read_csv(uploaded_file)
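
            # Render the visualizer's HTML output; cells are cast to str first.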
            if df is not None:
                st.components.v1.html(visualizer(df.astype(str)), height=1500)


if __name__ == "__main__":
    main()