mmnga committed on
Commit
4a1e33f
1 Parent(s): 4301244
Files changed (1) hide show
  1. app.py +90 -66
app.py CHANGED
@@ -48,84 +48,108 @@ selected_vocab = st.sidebar.radio("", list(vocab_list.keys()))
48
  repo_id = vocab_list[selected_vocab]
49
  tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def escape_decode(token):
52
  ret = token
53
  tok_enc = token.encode()
54
  if len(tok_enc) == 1:
55
  b = tok_enc[0]
56
-
57
- if b == 0x00:
58
- ret = "<!!!!VOCABVIEWER ASCII 0x00 NUL!!!!>"
59
- elif b == 0x01:
60
- ret = "<!!!!VOCABVIEWER ASCII 0x01 SOH!!!!>"
61
- elif b == 0x02:
62
- ret = "<!!!!VOCABVIEWER ASCII 0x02 STX!!!!>"
63
- elif b == 0x03:
64
- ret = "<!!!!VOCABVIEWER ASCII 0x03 ETX!!!!>"
65
- elif b == 0x04:
66
- ret = "<!!!!VOCABVIEWER ASCII 0x04 EOT!!!!>"
67
- elif b == 0x05:
68
- ret = "<!!!!VOCABVIEWER ASCII 0x05 ENQ!!!!>"
69
- elif b == 0x06:
70
- ret = "<!!!!VOCABVIEWER ASCII 0x06 ACK!!!!>"
71
- elif b == 0x07:
72
- ret = "<!!!!VOCABVIEWER ASCII 0x07 BEL!!!!>"
73
- elif b == 0x08:
74
- ret = "<!!!!VOCABVIEWER ASCII 0x08 BS!!!!>"
75
- elif b == 0x0a:
76
- ret = "<!!!!VOCABVIEWER ASCII 0x0a LF!!!!>"
77
- elif b == 0x0b:
78
- ret = "<!!!!VOCABVIEWER ASCII 0x0b VT!!!!>"
79
- elif b == 0x0c:
80
- ret = "<!!!!VOCABVIEWER ASCII 0x0c FF!!!!>"
81
- elif b == 0x0d:
82
- ret = "<!!!!VOCABVIEWER ASCII 0x0d CR!!!!>"
83
- elif b == 0x0e:
84
- ret = "<!!!!VOCABVIEWER ASCII 0x0e SO!!!!>"
85
- elif b == 0x0f:
86
- ret = "<!!!!VOCABVIEWER ASCII 0x0f SI!!!!>"
87
- elif b == 0x10:
88
- ret = "<!!!!VOCABVIEWER ASCII 0x10 DLE!!!!>"
89
- elif b == 0x11:
90
- ret = "<!!!!VOCABVIEWER ASCII 0x11 DC1!!!!>"
91
- elif b == 0x12:
92
- ret = "<!!!!VOCABVIEWER ASCII 0x12 DC2!!!!>"
93
- elif b == 0x13:
94
- ret = "<!!!!VOCABVIEWER ASCII 0x13 DC3!!!!>"
95
- elif b == 0x14:
96
- ret = "<!!!!VOCABVIEWER ASCII 0x14 DC4!!!!>"
97
- elif b == 0x15:
98
- ret = "<!!!!VOCABVIEWER ASCII 0x15 NAK!!!!>"
99
- elif b == 0x16:
100
- ret = "<!!!!VOCABVIEWER ASCII 0x16 SYN!!!!>"
101
- elif b == 0x17:
102
- ret = "<!!!!VOCABVIEWER ASCII 0x17 ETB!!!!>"
103
- elif b == 0x18:
104
- ret = "<!!!!VOCABVIEWER ASCII 0x18 CAN!!!!>"
105
- elif b == 0x19:
106
- ret = "<!!!!VOCABVIEWER ASCII 0x19 EM!!!!>"
107
- elif b == 0x1a:
108
- ret = "<!!!!VOCABVIEWER ASCII 0x1a SUB!!!!>"
109
- elif b == 0x1b:
110
- ret = "<!!!!VOCABVIEWER ASCII 0x1b ESC!!!!>"
111
- elif b == 0x1c:
112
- ret = "<!!!!VOCABVIEWER ASCII 0x1c FS!!!!>"
113
- elif b == 0x1d:
114
- ret = "<!!!!VOCABVIEWER ASCII 0x1d GS!!!!>"
115
- elif b == 0x1e:
116
- ret = "<!!!!VOCABVIEWER ASCII 0x1e RS!!!!>"
117
- elif b == 0x1f:
118
- ret = "<!!!!VOCABVIEWER ASCII 0x1f US!!!!>"
119
- elif b == 0x20:
120
- ret = "<!!!!VOCABVIEWER ASCII 0x20 SPC!!!!>"
121
  elif b == 0x7f:
122
- ret = "<!!!!VOCABVIEWER ASCII 0x7f DEL!!!!>"
123
  elif tok_enc == b"\xef\xbf\xbd":
124
  ret = "<!!!!VOCABVIEWER REPLACEMENT CHARACTER U+FFFD!!!!>"
125
  elif tok_enc == b"\xe2\x80\xa8" or tok_enc == b"\xe2\x80\xab" or tok_enc == b"\xe2\x80\xac" or tok_enc == b"\xe2\x80\xad" or tok_enc == b"\xe2\x80\xaf":
126
  ret = "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"
127
  elif tok_enc == b"\xe2\x80\x8e" or tok_enc == b"\xe2\x80\x8f" or tok_enc == b"\x2c\xe2\x80\x8e" :
128
  ret = "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  return ret
131
  # sort
 
48
  repo_id = vocab_list[selected_vocab]
49
  tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
50
 
51
# Short names for the byte values that escape_byte() turns into a visible
# placeholder. 0x09 (TAB) has no entry and 0x20 (space) is handled separately.
_ESCAPED_BYTE_NAMES = {
    0x00: "NUL",
    0x01: "SOH",
    0x02: "STX",
    0x03: "ETX",
    0x04: "EOT",
    0x05: "ENQ",
    0x06: "ACK",
    0x07: "BEL",
    0x08: "BS",
    0x0A: "LF",
    0x0B: "VT",
    0x0C: "FF",
    0x0D: "CR",
    0x0E: "SO",
    0x0F: "SI",
    0x10: "DLE",
    0x11: "DC1",
    0x12: "DC2",
    0x13: "DC3",
    0x14: "DC4",
    0x15: "NAK",
    0x16: "SYN",
    0x17: "ETB",
    0x18: "CAN",
    0x19: "EM",
    0x1A: "SUB",
    0x1B: "ESC",
    0x1C: "FS",
    0x1D: "GS",
    0x1E: "RS",
    0x1F: "US",
    0x7F: "DEL",
}


def escape_byte(b):
    """Return the display replacement for one byte value.

    Args:
        b: Integer byte value to look up.

    Returns:
        ``<!!!!VOCABVIEWER ASCII 0xNN NAME!!!!>`` for the values listed in
        ``_ESCAPED_BYTE_NAMES``, a plain ``" "`` for 0x20, and the empty
        string for every other value, which callers treat as "no replacement,
        keep the original byte".
    """
    if b == 0x20:
        # A space is passed through as-is rather than replaced by a label.
        return " "

    name = _ESCAPED_BYTE_NAMES.get(b)
    if name is None:
        return ""
    return f"<!!!!VOCABVIEWER ASCII 0x{b:02x} {name}!!!!>"
122
+
123
  def escape_decode(token):
124
  ret = token
125
  tok_enc = token.encode()
126
  if len(tok_enc) == 1:
127
  b = tok_enc[0]
128
+ if b >= 0x00 and b <= 0x08:
129
+ ret = escape_byte(b)
130
+ elif b >= 0x0a and b <= 0x19:
131
+ ret = escape_byte(b)
132
+ elif b >= 0x1a and b <= 0x20:
133
+ ret = escape_byte(b)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  elif b == 0x7f:
135
+ ret = escape_byte(b)
136
  elif tok_enc == b"\xef\xbf\xbd":
137
  ret = "<!!!!VOCABVIEWER REPLACEMENT CHARACTER U+FFFD!!!!>"
138
  elif tok_enc == b"\xe2\x80\xa8" or tok_enc == b"\xe2\x80\xab" or tok_enc == b"\xe2\x80\xac" or tok_enc == b"\xe2\x80\xad" or tok_enc == b"\xe2\x80\xaf":
139
  ret = "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"
140
  elif tok_enc == b"\xe2\x80\x8e" or tok_enc == b"\xe2\x80\x8f" or tok_enc == b"\x2c\xe2\x80\x8e" :
141
  ret = "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"
142
+ else:
143
+ escape_tok_enc = bytes()
144
+ for b in tok_enc:
145
+ es = escape_byte(b)
146
+ if len(es) > 0:
147
+ escape_tok_enc += es.encode()
148
+ else:
149
+ # add byte
150
+ escape_tok_enc += b.to_bytes(1, "big")
151
+
152
+ ret = escape_tok_enc.decode()
153
 
154
  return ret
155
  # sort