Spaces:

taka-yamakoshi
/

tokenizer-demo

Running

tokenizer-demo / app.py

taka-yamakoshi

first commit

5e5793b almost 2 years ago

2.25 kB

	import pandas as pd
	import streamlit as st
	import numpy as np
	import torch
	from transformers import AlbertTokenizer
	import io
	import time

	@st.cache(show_spinner=True,allow_output_mutation=True)
	def load_model(model_name: str) -> AlbertTokenizer:
	    """Load the tokenizer for ``model_name``.

	    The function is wrapped in ``st.cache`` so Streamlit can reuse the
	    result across script reruns instead of loading it again;
	    ``allow_output_mutation=True`` is passed to the cache decorator as in
	    the original code.

	    Args:
	        model_name: Pretrained model identifier. Only names starting with
	            ``'albert'`` are supported (e.g. ``'albert-xxlarge-v2'``).

	    Returns:
	        The ``AlbertTokenizer`` returned by
	        ``AlbertTokenizer.from_pretrained(model_name)``.

	    Raises:
	        ValueError: If ``model_name`` does not start with ``'albert'``.
	    """
	    # Fail loudly for unsupported names: without this guard the function
	    # would reach its return with no tokenizer bound.
	    if not model_name.startswith('albert'):
	        raise ValueError(
	            f"Unsupported model {model_name!r}: only 'albert*' models are supported"
	        )
	    return AlbertTokenizer.from_pretrained(model_name)


	if __name__=='__main__':
	    # Streamlit entry point: renders two sentence inputs side by side,
	    # shows how each sentence is split into tokens, and reports whether
	    # both sentences have the same number of tokens.

	    # Config
	    max_width = 1500
	    padding_top = 0
	    padding_right = 2
	    padding_bottom = 0
	    padding_left = 2

	    # CSS overriding the width and padding of the main block container.
	    define_margins = f"""
	    <style>
	    .appview-container .main .block-container{{
	    max-width: {max_width}px;
	    padding-top: {padding_top}rem;
	    padding-right: {padding_right}rem;
	    padding-left: {padding_left}rem;
	    padding-bottom: {padding_bottom}rem;
	    }}
	    </style>
	    """
	    # CSS hiding `tbody th` and `.blank` elements (the row-index cells of
	    # rendered tables, going by the variable name).
	    hide_table_row_index = """
	    <style>
	    tbody th {display:none}
	    .blank {display:none}
	    </style>
	    """
	    st.markdown(define_margins, unsafe_allow_html=True)
	    st.markdown(hide_table_row_index, unsafe_allow_html=True)
	    # The original passed `on_change=clear_df`, but no `clear_df` exists in
	    # this script, so the callback is dropped to avoid a NameError.
	    # NOTE(review): `input_type` is not consumed anywhere below - confirm
	    # whether this widget is still wanted.
	    input_type = st.sidebar.radio(
	        label='1. Choose the input type',
	        options=('Use one of the example sentences','Use your own initial sentence')
	    )

	    # Title
	    st.header("Tokenizer Demo")

	    tokenizer = load_model('albert-xxlarge-v2')
	    sent_cols = st.columns(2)
	    num_tokens = {}
	    for sent_id, sent_col in enumerate(sent_cols):
	        with sent_col:
	            sentence = st.text_input(f'Sentence {sent_id+1}')
	            input_sent = tokenizer(sentence)['input_ids']
	            # Drop the first and last ids (presumably the special
	            # start/end tokens added by the tokenizer - TODO confirm) and
	            # decode every remaining id on its own.
	            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
	            token_count = len(decoded_sent)
	            num_tokens[f'sent_{sent_id}'] = token_count

	            # st.columns rejects an empty width list, which is what an
	            # empty text box produces, so only lay out tokens if any exist.
	            if decoded_sent:
	                # Relative column widths grow with the token's length.
	                char_nums = [len(word)+2 for word in decoded_sent]
	                word_cols = st.columns(char_nums)
	                for word_col, word in zip(word_cols, decoded_sent):
	                    with word_col:
	                        st.write(word)
	            # Use a plain local instead of nesting same-type quotes inside
	            # the f-string, which is a SyntaxError before Python 3.12.
	            st.write(f'{token_count} tokens')

	    # Keys are built from enumerate(), so they are 'sent_0' and 'sent_1'.
	    if num_tokens['sent_0'] == num_tokens['sent_1']:
	        st.write('Matched!')
	    else:
	        st.write('Not Matched...')