"""Streamlit app that visualizes how TAPAS tokenizes a table (CSV input)."""

import io

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer

from tapas_visualizer import TapasVisualizer

st.set_page_config(page_title="Tapas Tokenizer", page_icon="‍🍽️", layout="wide")


def set_file_input():
    """on_change callback: record that the latest input came from the uploader."""
    st.session_state.input_stream = "file"


def set_text_input():
    """on_change callback: record that the latest input came from the text area."""
    st.session_state.input_stream = "text"


@st.cache_resource()
def load_tokenizer(model_name: str):
    """Load and cache the tokenizer for ``model_name``.

    The model name is a parameter so ``st.cache_resource`` keys the cache
    on it. The previous version closed over the selectbox value with a
    zero-argument function, so switching models kept returning the first
    tokenizer that was ever loaded.
    """
    return AutoTokenizer.from_pretrained(model_name)


def main():
    """Render the two-column TAPAS tokenization visualizer page."""
    models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]

    st.markdown(
        """
        ## TAPAS Tokenization Visualization

        [TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)
        models work on Tables. The tool below is to help visualize how the
        table is tokenized and give total (+ row-wise) token counts.
        Implementation adapted from `tokenizers.tools.EncodingVisualizer`.
        """
    )

    col1, col2 = st.columns([1, 2])

    with col1:
        selected_model = st.selectbox("Select a tokenizer", models, key=1)

        text = st.text_area(
            label="", placeholder="Table to tokenize; csv", on_change=set_text_input
        )
        uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
        button_clicked = st.button("Tokenize")

    visualizer = TapasVisualizer(load_tokenizer(selected_model))

    with col2:
        if text or uploaded_file or button_clicked:
            df = None  # stays None when no usable input was provided
            if (
                "input_stream" not in st.session_state
                or st.session_state.input_stream == "text"
            ):
                # Guard: pd.read_csv raises EmptyDataError on an empty
                # string (e.g. "Tokenize" clicked before typing anything).
                if text.strip():
                    df = pd.read_csv(io.StringIO(text), sep=",")
            elif st.session_state.input_stream == "file" and uploaded_file is not None:
                df = pd.read_csv(uploaded_file)

            if df is not None:
                st.components.v1.html(visualizer(df.astype(str)), height=1500)


if __name__ == "__main__":
    main()