Annorita committed on
Commit
ab98424
1 Parent(s): 1c58aa1
Files changed (2) hide show
  1. app.py +12 -13
  2. utils.py +3 -2
app.py CHANGED
@@ -1,29 +1,28 @@
1
  import streamlit as st
2
  from utils import get_res
3
 
4
- st.title('Tokenizers demo')
5
 
6
  #x = st.slider('Select a value')
7
  #st.write(x, 'squared is', x * x)
8
 
 
 
 
 
 
9
 
10
- option = st.selectbox(
11
- 'Choose a tokenizer',
12
- ['狗', '貓', '鸚鵡', '天竺鼠'])
13
- '你的答案:', option
14
 
 
15
 
16
- model_name = st.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')
17
-
18
- 'Your choice:', model_name
19
-
20
- input_data = st.text_input('Input Sentence', 'Hello world!!!')
21
 
22
 
23
  res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)
24
 
25
  #st.markdown('<style></style>')
26
 
27
- input_text = 'hiiiii'
28
- st.markdown(f'<p style="background-color:#0066cc">{input_text}</p>',
29
- unsafe_allow_html=True)
 
1
  import streamlit as st
2
  from utils import get_res
3
 
4
+ st.sidebar.title('Tokenizers demo')
5
 
6
  #x = st.slider('Select a value')
7
  #st.write(x, 'squared is', x * x)
8
 
9
+ st.sidebar.subheader('Choose the tokenizer', divider='grey')
10
+ option = st.sidebar.selectbox(
11
+ 'model_name',
12
+ ['deepseek-ai/deepseek-coder-1.3b-instruct',
13
+ 'bigcode/starcoder'])
14
 
15
+ model_name = st.sidebar.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')
 
 
 
16
 
17
+ #'Your choice:', model_name
18
 
19
+ st.sidebar.subheader('Write the input sentence', divider='grey')
20
+ input_data = st.sidebar.text_input('Input Sentence', 'Hello world!!!')
 
 
 
21
 
22
 
23
  res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)
24
 
25
  #st.markdown('<style></style>')
26
 
27
+ st.subheader('Tokenized result', divider='grey')
28
+ st.markdown(res, unsafe_allow_html=True)
 
utils.py CHANGED
@@ -5,7 +5,8 @@ import itertools
5
 
6
 
7
  def get_color():
8
- colors = [i for i in range(41, 48)]
 
9
  return itertools.cycle(colors)
10
 
11
  def get_res(model_name, input_sentence, single_print=True):
@@ -15,7 +16,7 @@ def get_res(model_name, input_sentence, single_print=True):
15
  out = tokenizer.encode(input_sentence, add_special_tokens=False)
16
  token_num = len(out)
17
 
18
- w = [ '\033[''1;'+str(next(color_iterator))+f'm {tokenizer.decode(x)}\033[m' for x in out]
19
  res = ''.join(w) + f' {str(token_num)}'
20
  if single_print:
21
  print(res)
 
5
 
6
 
7
  def get_color():
8
+ colors = ['#df7b55', '#2c7482', '#2c8234', '#5581df', '#822c63','#b355df']
9
+
10
  return itertools.cycle(colors)
11
 
12
  def get_res(model_name, input_sentence, single_print=True):
 
16
  out = tokenizer.encode(input_sentence, add_special_tokens=False)
17
  token_num = len(out)
18
 
19
+ w = [ f'<span style="background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
20
  res = ''.join(w) + f' {str(token_num)}'
21
  if single_print:
22
  print(res)