Spaces:

Annorita
/

tokenizer_comparison

Sleeping

File size: 2,588 Bytes

5ff29be
35996ec
5ff29be
ae7ad9f
ab98424
6893866
35996ec
 
96a0e76
ae7ad9f
 
7cf6c21
19898a1
1d128e9
 
 
19898a1
 
 
 
 
 
 
 
 
 
 
ae7ad9f
19898a1
 
 
 
 
 
 
 
 
 
 
 
96a0e76
1459d42
ab98424
1459d42
19898a1
 
 
 
 
 
 
 
 
 
 
 
ae7ad9f
 
 
 
 
 
 
35996ec
ae7ad9f
 
35996ec
ae7ad9f
 
 
35996ec
ae7ad9f
 
 
1c58aa1
ae7ad9f
 
224f5e0
ae7ad9f

import streamlit as st
from utils import get_res


# Heading displayed at the top of the sidebar.
st.sidebar.title('Tokenizers demo')

# Preset tokenizer checkpoints offered in the sidebar. The final entry is a
# sentinel: the selection code compares against it to decide whether to show
# a free-text field instead.
model_option = [
    'deepseek-ai/deepseek-coder-1.3b-instruct',
    'MediaTek-Research/Breeze-7B-Instruct-64k-v0_1',
    'microsoft/phi-2',
    'enter by myself',
]

# Preset input sentences (a number, English, Traditional and Simplified
# Chinese). The final entry is the same free-text sentinel as above.
input_option = [
    '123.5',
    'hello world!!!',
    '大雨＋寒流來襲！全台極凍72小時「探5度以下」',
    '大雨＋寒流来袭！全台极冻72小时「探5度以下」',
    'enter by myself',
]


def _choose_model(select_label, entry_label):
    """Return the model name chosen in the sidebar for one comparison slot.

    A select box labelled ``select_label`` lists the entries of
    ``model_option``. When the 'enter by myself' entry is picked, a text
    input labelled ``entry_label`` is shown and its content is returned
    instead of the select box value.
    """
    chosen = st.sidebar.selectbox(select_label, model_option)
    if chosen != 'enter by myself':
        return chosen
    return st.sidebar.text_input(entry_label, 'deepseek-ai/deepseek-coder-1.3b-instruct')


st.sidebar.subheader('Choose the tokenizer', divider='grey')
st.sidebar.write('You can choose `enter by myself` to paste the model you want.')

# The two slots are resolved in order so the sidebar widgets appear as
# A (select, optional text box) followed by B (select, optional text box).
model_name_A = _choose_model('Model Name A', 'Please enter Model Name A')
model_name_B = _choose_model('Model Name B', 'Please enter Model Name B')



#with st.sidebar.expander("Models that you might want"):
#    for m in model_option:
#        st.write(m)


#'Your choice:', model_name

# Sidebar section for picking the sentence that both tokenizers will process.
st.sidebar.subheader('Choose the input sentence', divider='grey')
st.sidebar.write('You can choose `enter by myself` to enter the text you want.')

picked_sentence = st.sidebar.selectbox(label='Input Sentence', options=input_option)

# The sentinel entry swaps the preset for a free-text field with a default value.
wants_custom_sentence = picked_sentence == 'enter by myself'
input_data = (
    st.sidebar.text_input(label='Write the Input Sentence', value='Hello sunshine!!!')
    if wants_custom_sentence
    else picked_sentence
)

#with st.sidebar.expander("Input that you might want to test"):
#    for m in input_option:
#        st.write(m)


def _render_tokenizer_column(column, model_name, sentence):
    """Render one tokenizer's result for ``sentence`` inside ``column``.

    Shows the model name as a heading, calls ``get_res`` for the model and
    sentence, then displays the returned markup and the returned token count.

    Args:
        column: Streamlit column container (as returned by ``st.columns``)
            that the widgets are drawn into.
        model_name: Tokenizer/model identifier passed through to ``get_res``.
        sentence: Input text passed through to ``get_res``.
    """
    with column:
        st.subheader(model_name, divider='grey')
        res, token_num = get_res(model_name=model_name, input_sentence=sentence, single_print=False)

        st.subheader('Tokenized result')
        # NOTE(review): `res` is injected as raw HTML. Presumably get_res builds
        # the markup from the user's sentence -- confirm it escapes that text.
        st.markdown(res, unsafe_allow_html=True)

        st.subheader('Number of tokens')
        # `!s` applies str() to the count, matching the previous explicit call.
        st.markdown(f'<span style="font-size:1.875em">{token_num!s}</span>',
                    unsafe_allow_html=True)


# Side-by-side comparison: the same sentence tokenized by model A and model B.
col1, col2 = st.columns(2)
_render_tokenizer_column(col1, model_name_A, input_data)
_render_tokenizer_column(col2, model_name_B, input_data)