import io

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer

from tapas_visualizer import TapasVisualizer

st.set_page_config(page_title="Tapas Tokenizer", page_icon="🍽️", layout="wide")


def set_file_input():
    # Record that the most recent input came from the file uploader.
    st.session_state.input_stream = "file"


def set_text_input():
    # Record that the most recent input came from the text area.
    st.session_state.input_stream = "text"


def main():
    models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]

    def load_tokenizer(model_name: str):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return tokenizer

    st.markdown(
        """
## TAPAS Tokenization Visualization

[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas) models work on tables.
The tool below helps visualize how the table is tokenized and gives total (+ row-wise)
token counts.

Implementation adapted from `tokenizers.tools.EncodingVisualizer`.
"""
    )
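
    # Illustrative input only: any comma-separated table with a header row
    # works in the text box below, e.g.
    #
    #   Actor,Number of movies
    #   Brad Pitt,87
    #   Leonardo Di Caprio,53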
    col1, col2 = st.columns([1, 2])

    with col1:
        selected_model = st.selectbox("Select a tokenizer", models, key=1)
        text = st.text_area(
            label="", placeholder="Table to tokenize; csv", on_change=set_text_input
        )
        uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
        button_clicked = st.button("Tokenize")

    tokenizer = load_tokenizer(selected_model)
    visualizer = TapasVisualizer(tokenizer)
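
    # TapasVisualizer comes from the local `tapas_visualizer` module (not shown
    # here); based on its use below, it is assumed to be callable with a
    # DataFrame of strings and to return an HTML snippet with the tokenized
    # table plus total and row-wise token counts.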
    with col2:
        if text or uploaded_file or button_clicked:
            df: pd.DataFrame | None = None
            # Parse whichever input was supplied last; skip empty inputs so
            # pandas is never handed an empty string or a missing file.
            if text and (
                "input_stream" not in st.session_state
                or st.session_state.input_stream == "text"
            ):
                df = pd.read_csv(io.StringIO(text), sep=",")
            elif (
                uploaded_file is not None
                and st.session_state.get("input_stream") == "file"
            ):
                df = pd.read_csv(uploaded_file)

            if df is not None:
                st.components.v1.html(visualizer(df.astype(str)), height=1500)


if __name__ == "__main__":
    main()
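
# Quick usage sketch (assumptions: this file is saved as `app.py` and
# `tapas_visualizer.py` sits next to it; dependencies inferred from the
# imports above):
#
#   pip install streamlit pandas transformers
#   streamlit run app.py
#
# Paste a CSV table into the text box (or upload a CSV file) and click
# "Tokenize"; the rendered HTML in the right column shows the tokenization
# and token counts.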