Spaces:

Annorita
/

tokenizer_comparison

Sleeping

App Files Community

Annorita committed on Jan 23, 2024

Commit

ae7ad9f

1 Parent(s): 224f5e0

add model option

Browse files

Files changed (2) hide show

app.py +38 -13
utils.py +1 -1

app.py CHANGED Viewed

@@ -1,31 +1,56 @@
 import streamlit as st
 from utils import get_res
 st.sidebar.title('Tokenizers demo')
 #x = st.slider('Select a value')
 #st.write(x, 'squared is', x * x)
-st.sidebar.subheader('Choose the tokenizer', divider='grey')
-option = st.sidebar.selectbox(
-    'model_name',
-    ['deepseek-ai/deepseek-coder-1.3b-instruct',
-     'bigcode/starcoder'])
-model_name = st.sidebar.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')
 #'Your choice:', model_name
 st.sidebar.subheader('Write the input sentence', divider='grey')
-input_data = st.sidebar.text_input('Input Sentence', 'Hello world!!!')
-res, token_num = get_res(model_name=model_name, input_sentence=input_data, single_print=False)
-#st.markdown('<style></style>')
-st.subheader('Tokenized result', divider='grey')
-st.markdown(res, unsafe_allow_html=True)
-st.subheader('Number of tokens', divider='grey')
-st.write(token_num)

 import streamlit as st
 from utils import get_res
 st.sidebar.title('Tokenizers demo')
 #x = st.slider('Select a value')
 #st.write(x, 'squared is', x * x)
+#st.sidebar.subheader('Choose the tokenizer', divider='grey')
+#option = st.sidebar.selectbox(
+#    'model_name',
+#    ['deepseek-ai/deepseek-coder-1.3b-instruct',
+#     'bigcode/starcoder'])
+model_name_A = st.sidebar.text_input('Model Name A', 'deepseek-ai/deepseek-coder-1.3b-instruct')
+model_name_B = st.sidebar.text_input('Model Name B', 'deepseek-ai/deepseek-coder-1.3b-instruct')
+model_option = ['deepseek-ai/deepseek-coder-1.3b-instruct',
+             'MediaTek-Research/Breeze-7B-Instruct-64k-v0_1',
+             'microsoft/phi-2']
+with st.sidebar.expander("Models that you might want"):
+    for m in model_option:
+        st.write(m)
 #'Your choice:', model_name
 st.sidebar.subheader('Write the input sentence', divider='grey')
+input_data = st.sidebar.text_input('Input Sentence', 'Hello sunshine!!!')
+col1, col2 = st.columns(2)
+with col1:
+    st.subheader(model_name_A, divider='grey')
+    res, token_num = get_res(model_name=model_name_A, input_sentence=input_data, single_print=False)
+    st.subheader('Tokenized result')
+    st.markdown(res, unsafe_allow_html=True)
+    st.subheader('Number of tokens')
+    st.markdown(f'<span style="font-size:1.875em">{str(token_num)}</span>',
+                unsafe_allow_html=True)
+with col2:
+    st.subheader(model_name_B, divider='grey')
+    res, token_num = get_res(model_name=model_name_B, input_sentence=input_data, single_print=False)
+    st.subheader('Tokenized result')
+    st.markdown(res, unsafe_allow_html=True)
+    st.subheader('Number of tokens')
+    st.markdown(f'<span style="font-size:1.875em">{str(token_num)}</span>',
+                unsafe_allow_html=True)

utils.py CHANGED Viewed

@@ -16,7 +16,7 @@ def get_res(model_name, input_sentence, single_print=True):
     out = tokenizer.encode(input_sentence, add_special_tokens=False)
     token_num = len(out)
-    w = [ f'<span style="background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
     res = ''.join(w)
     if single_print:
         print(res + str(token_num))

     out = tokenizer.encode(input_sentence, add_special_tokens=False)
     token_num = len(out)
+    w = [ f'<span style="font-size:1.25em;background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
     res = ''.join(w)
     if single_print:
         print(res + str(token_num))