taka-yamakoshi commited on
Commit
5e5793b
1 Parent(s): 8d5d895

first commit

Browse files
Files changed (3) hide show
  1. app.py +72 -0
  2. packages.txt +1 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import numpy as np
4
+ import torch
5
+ from transformers import AlbertTokenizer
6
+ import io
7
+ import time
8
+
9
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    """Load and cache the tokenizer for ``model_name``.

    Parameters
    ----------
    model_name : str
        Hugging Face model identifier; only ALBERT checkpoints
        (names starting with 'albert') are supported here.

    Returns
    -------
    AlbertTokenizer
        The pretrained tokenizer for ``model_name``.

    Raises
    ------
    ValueError
        If ``model_name`` is not an ALBERT checkpoint.  The original
        code silently fell through and returned None in that case,
        which would surface later as an opaque TypeError at the call
        site.
    """
    if model_name.startswith('albert'):
        return AlbertTokenizer.from_pretrained(model_name)
    raise ValueError(f'Unsupported model: {model_name!r} (only ALBERT models are handled)')
14
+
15
+
16
if __name__ == '__main__':

    # --- Page layout configuration -------------------------------------
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2

    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # BUG FIX: the original passed on_change=clear_df, but clear_df is not
    # defined anywhere in this file, so the app crashed with NameError on
    # startup.  The callback is dropped; input_type is currently unused
    # below, but the widget is kept so the sidebar UI is unchanged.
    input_type = st.sidebar.radio(
        label='1. Choose the input type',
        options=('Use one of the example sentences', 'Use your own initial sentence'),
    )

    # Title
    st.header("Tokenizer Demo")

    tokenizer = load_model('albert-xxlarge-v2')
    sent_cols = st.columns(2)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            input_sent = tokenizer(sentence)['input_ids']
            # Drop the first and last ids ([CLS]/[SEP] special tokens)
            # before decoding the individual tokens.
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)

            # Robustness: st.columns rejects an empty width spec, which the
            # original hit whenever the text box was empty.
            if decoded_sent:
                char_nums = [len(word) + 2 for word in decoded_sent]
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            # BUG FIX: the original nested single quotes inside a
            # single-quoted f-string, a SyntaxError on Python < 3.12.
            st.write(f"{num_tokens[f'sent_{sent_id}']} tokens")

    # BUG FIX: enumerate() is zero-based, so the dict keys are
    # 'sent_0' and 'sent_1'; the original compared 'sent_1'/'sent_2'
    # and raised KeyError.
    if num_tokens['sent_0'] == num_tokens['sent_1']:
        st.write('Matched!')
    else:
        st.write('Not Matched...')
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libgl1-mesa-dev
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ sentence_transformers
4
+ opencv-python
5
+ seaborn
6
+ scikit-learn
7
+ protobuf~=3.19.0