import io
import time

import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AlbertTokenizer


# NOTE(review): st.cache is deprecated in recent Streamlit releases in favour
# of st.cache_resource -- confirm the pinned Streamlit version before switching.
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    """Load the tokenizer for ``model_name``; the result is cached by Streamlit.

    Args:
        model_name: Checkpoint name passed to ``AlbertTokenizer.from_pretrained``.
            Only names starting with ``'albert'`` are handled.

    Returns:
        The ``AlbertTokenizer`` loaded for ``model_name``.

    Raises:
        ValueError: If ``model_name`` does not start with ``'albert'``.
    """
    if not model_name.startswith('albert'):
        # Previously this path fell through to an unbound local variable.
        raise ValueError(
            f'Unsupported model {model_name!r}: only ALBERT checkpoints are handled'
        )
    return AlbertTokenizer.from_pretrained(model_name)


def _render_sentence(tokenizer, sent_id):
    """Render one sentence input with its tokens and return the token count.

    Must be called inside the Streamlit container the widgets should appear in.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and exposing ``decode``.
        sent_id: Zero-based index of the sentence; shown one-based in the label.

    Returns:
        Number of tokens in the sentence, excluding the first and last ids.
    """
    sentence = st.text_input(f'Sentence {sent_id+1}')
    input_ids = tokenizer(sentence)['input_ids']
    # Drop the first and last ids, i.e. the special tokens the tokenizer
    # wraps around the sequence.
    tokens = [tokenizer.decode([token_id]) for token_id in input_ids[1:-1]]

    # st.columns rejects an empty weight list, so only lay out columns when
    # there is at least one token (the text input is empty on first load).
    if tokens:
        # Column widths are proportional to token length, plus some padding.
        widths = [len(token) + 2 for token in tokens]
        for token_col, token in zip(st.columns(widths), tokens):
            with token_col:
                st.write(token)
    st.write(f'{len(tokens)} tokens')
    return len(tokens)


def main():
    """Build the page: two sentence inputs side by side and a match verdict."""
    # Config
    # NOTE(review): the two style snippets below are blank in this revision,
    # so these layout settings are currently unused; presumably they are meant
    # to be interpolated into the CSS -- confirm against the intended styles.
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2

    define_margins = f""" """
    hide_table_row_index = """ """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # Title
    st.header("Tokenizer Demo")

    tokenizer = load_model('albert-xxlarge-v2')

    token_counts = []
    for sent_id, sent_col in enumerate(st.columns(2)):
        with sent_col:
            token_counts.append(_render_sentence(tokenizer, sent_id))

    # Compare the two sentences' token counts. The counts are kept in a list
    # because the earlier dict lookup read a key ('sent_2') never written.
    if token_counts[0] == token_counts[1]:
        st.subheader('Matched!')
    else:
        st.subheader('Not Matched...')


if __name__ == '__main__':
    main()