"""Streamlit demo: tokenize two sentences with an ALBERT tokenizer and
compare their token counts side by side."""
import io
import time

import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AlbertTokenizer


@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    """Load and cache the tokenizer for *model_name*.

    Only ALBERT-family checkpoints are handled; returns None for any
    other model name (no other branch exists below).
    """
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer


def clear_df():
    """Callback for the sidebar radio button.

    BUG FIX: the original referenced `clear_df` in `on_change=` but never
    defined it, raising NameError at startup. NOTE(review): presumably this
    was meant to reset per-run state when the input type changes — confirm
    the intended behavior; a no-op keeps the app running in the meantime.
    """
    pass


if __name__ == '__main__':
    # Page-layout configuration values.
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    # NOTE(review): these CSS strings are empty in the original, even though
    # the layout variables above were clearly meant to be interpolated here.
    # Kept empty to avoid changing the rendered page — confirm intent.
    define_margins = f"""
    """
    hide_table_row_index = """
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    input_type = st.sidebar.radio(
        label='1. Choose the input type',
        on_change=clear_df,
        options=('Use one of the example sentences', 'Use your own initial sentence')
    )

    # Title
    st.header("Tokenizer Demo")
    tokenizer = load_model('albert-xxlarge-v2')

    sent_cols = st.columns(2)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            input_sent = tokenizer(sentence)['input_ids']
            # Drop the first/last special tokens ([CLS]/[SEP]) before display.
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)

            # ROBUSTNESS: st.columns([]) raises on an empty sentence, so only
            # lay out per-token columns when there are tokens to show.
            if decoded_sent:
                char_nums = [len(word) + 2 for word in decoded_sent]
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            # BUG FIX: the original nested single quotes inside a
            # single-quoted f-string — a SyntaxError on Python < 3.12.
            st.write(f"{num_tokens[f'sent_{sent_id}']} tokens")

    # BUG FIX: enumerate() starts at 0, so the dict keys are sent_0/sent_1;
    # the original compared sent_1 against the nonexistent sent_2 (KeyError).
    if num_tokens['sent_0'] == num_tokens['sent_1']:
        st.write('Matched!')
    else:
        st.write('Not Matched...')