# tokenizer-demo / app.py
import streamlit as st
from transformers import AlbertTokenizer

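# Note: st.cache(allow_output_mutation=True) is Streamlit's legacy caching
# API; on newer Streamlit releases (an assumption about the environment, not
# stated in this file) the equivalent decorator would be @st.cache_resource.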
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    # Load the tokenizer once and cache it across Streamlit reruns.
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer
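# A possible generalization (an assumption, not part of the original app):
# transformers' AutoTokenizer resolves most model names, so the ALBERT-only
# branch above could become:
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#
# Left as a comment here so the demo's behavior is unchanged.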
if __name__ == '__main__':
    # Page layout config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)
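    # The CSS above hides the row-index column of st.table output ('tbody th'
    # and '.blank' are Streamlit's static-table markup; note the demo as
    # written never renders a table, so this rule is precautionary).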
    # Title
    st.header("Tokenizer Demo")
    tokenizer = load_model('albert-xxlarge-v2')

    sent_cols = st.columns(2)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            # Tokenize, then decode each piece individually, dropping the
            # special [CLS]/[SEP] tokens at the ends.
            input_sent = tokenizer(sentence)['input_ids']
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)
            # Lay the tokens out side by side, sizing each column roughly by
            # the token's character length; skip when the input is empty,
            # since st.columns rejects an empty spec.
            char_nums = [len(word)+2 for word in decoded_sent]
            if char_nums:
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            st.write(f'{len(decoded_sent)} tokens')
    # Compare token counts for the two sentences (keys are sent_0 and sent_1,
    # since enumerate starts at 0).
    if num_tokens['sent_0'] == num_tokens['sent_1']:
        st.subheader('Matched!')
    else:
        st.subheader('Not Matched...')
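
# ---------------------------------------------------------------------------
# Usage (assumption: the standard Streamlit workflow, not stated in the file):
#     streamlit run app.py
# For a quick offline look at the same tokenization, e.g. in a Python REPL:
#     tok = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
#     tok.tokenize('a tokenizer demo')  # SentencePiece pieces; a leading
#                                       # '▁' marks a preceding space
# ---------------------------------------------------------------------------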