Spaces:
Running
Running
fix
Browse files
app.py
CHANGED
@@ -48,84 +48,108 @@ selected_vocab = st.sidebar.radio("", list(vocab_list.keys()))
|
|
48 |
repo_id = vocab_list[selected_vocab]
|
49 |
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def escape_decode(token):
|
52 |
ret = token
|
53 |
tok_enc = token.encode()
|
54 |
if len(tok_enc) == 1:
|
55 |
b = tok_enc[0]
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x02 STX!!!!>"
|
63 |
-
elif b == 0x03:
|
64 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x03 ETX!!!!>"
|
65 |
-
elif b == 0x04:
|
66 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x04 EOT!!!!>"
|
67 |
-
elif b == 0x05:
|
68 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x05 ENQ!!!!>"
|
69 |
-
elif b == 0x06:
|
70 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x06 ACK!!!!>"
|
71 |
-
elif b == 0x07:
|
72 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x07 BEL!!!!>"
|
73 |
-
elif b == 0x08:
|
74 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x08 BS!!!!>"
|
75 |
-
elif b == 0x0a:
|
76 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x0a LF!!!!>"
|
77 |
-
elif b == 0x0b:
|
78 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x0b VT!!!!>"
|
79 |
-
elif b == 0x0c:
|
80 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x0c FF!!!!>"
|
81 |
-
elif b == 0x0d:
|
82 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x0d CR!!!!>"
|
83 |
-
elif b == 0x0e:
|
84 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x0e SO!!!!>"
|
85 |
-
elif b == 0x0f:
|
86 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x0f SI!!!!>"
|
87 |
-
elif b == 0x10:
|
88 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x10 DLE!!!!>"
|
89 |
-
elif b == 0x11:
|
90 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x11 DC1!!!!>"
|
91 |
-
elif b == 0x12:
|
92 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x12 DC2!!!!>"
|
93 |
-
elif b == 0x13:
|
94 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x13 DC3!!!!>"
|
95 |
-
elif b == 0x14:
|
96 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x14 DC4!!!!>"
|
97 |
-
elif b == 0x15:
|
98 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x15 NAK!!!!>"
|
99 |
-
elif b == 0x16:
|
100 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x16 SYN!!!!>"
|
101 |
-
elif b == 0x17:
|
102 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x17 ETB!!!!>"
|
103 |
-
elif b == 0x18:
|
104 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x18 CAN!!!!>"
|
105 |
-
elif b == 0x19:
|
106 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x19 EM!!!!>"
|
107 |
-
elif b == 0x1a:
|
108 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x1a SUB!!!!>"
|
109 |
-
elif b == 0x1b:
|
110 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x1b ESC!!!!>"
|
111 |
-
elif b == 0x1c:
|
112 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x1c FS!!!!>"
|
113 |
-
elif b == 0x1d:
|
114 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x1d GS!!!!>"
|
115 |
-
elif b == 0x1e:
|
116 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x1e RS!!!!>"
|
117 |
-
elif b == 0x1f:
|
118 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x1f US!!!!>"
|
119 |
-
elif b == 0x20:
|
120 |
-
ret = "<!!!!VOCABVIEWER ASCII 0x20 SPC!!!!>"
|
121 |
elif b == 0x7f:
|
122 |
-
ret =
|
123 |
elif tok_enc == b"\xef\xbf\xbd":
|
124 |
ret = "<!!!!VOCABVIEWER REPLACEMENT CHARACTER U+FFFD!!!!>"
|
125 |
elif tok_enc == b"\xe2\x80\xa8" or tok_enc == b"\xe2\x80\xab" or tok_enc == b"\xe2\x80\xac" or tok_enc == b"\xe2\x80\xad" or tok_enc == b"\xe2\x80\xaf":
|
126 |
ret = "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"
|
127 |
elif tok_enc == b"\xe2\x80\x8e" or tok_enc == b"\xe2\x80\x8f" or tok_enc == b"\x2c\xe2\x80\x8e" :
|
128 |
ret = "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
return ret
|
131 |
# sort
|
|
|
48 |
# Look up the repo id registered in vocab_list for the vocabulary the user
# picked with the sidebar radio button.
repo_id = vocab_list[selected_vocab]
|
49 |
# Load the tokenizer for the selected repo.
# NOTE(review): assuming AutoTokenizer is the transformers class,
# trust_remote_code=True lets the repo run its own Python code on load --
# confirm that every repo listed in vocab_list is trusted.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
|
50 |
|
51 |
+
# Abbreviations of the ASCII control characters that get a visible
# placeholder, keyed by byte value.
# NOTE(review): 0x09 (TAB) has no entry, exactly as in the if/elif chain this
# table replaces -- confirm that tabs are meant to stay unescaped.
_ASCII_CONTROL_NAMES = {
    0x00: "NUL", 0x01: "SOH", 0x02: "STX", 0x03: "ETX",
    0x04: "EOT", 0x05: "ENQ", 0x06: "ACK", 0x07: "BEL",
    0x08: "BS", 0x0a: "LF", 0x0b: "VT", 0x0c: "FF",
    0x0d: "CR", 0x0e: "SO", 0x0f: "SI", 0x10: "DLE",
    0x11: "DC1", 0x12: "DC2", 0x13: "DC3", 0x14: "DC4",
    0x15: "NAK", 0x16: "SYN", 0x17: "ETB", 0x18: "CAN",
    0x19: "EM", 0x1a: "SUB", 0x1b: "ESC", 0x1c: "FS",
    0x1d: "GS", 0x1e: "RS", 0x1f: "US", 0x7f: "DEL",
}

# Byte value -> replacement text, built once instead of on every call.
_BYTE_ESCAPES = {
    code: f"<!!!!VOCABVIEWER ASCII 0x{code:02x} {name}!!!!>"
    for code, name in _ASCII_CONTROL_NAMES.items()
}
# A space maps to itself (not to a placeholder); the result is still
# non-empty, so callers that test for a non-empty return treat it as handled.
_BYTE_ESCAPES[0x20] = " "


def escape_byte(b: int) -> str:
    """Return the display replacement for one byte value.

    Args:
        b: Integer byte value to look up.

    Returns:
        ``"<!!!!VOCABVIEWER ASCII 0xNN NAME!!!!>"`` for the ASCII control
        bytes 0x00-0x08, 0x0a-0x1f and 0x7f; ``" "`` for 0x20; and the empty
        string for every other value (including 0x09), meaning "no
        replacement, keep the byte as it is".
    """
    return _BYTE_ESCAPES.get(b, "")
|
122 |
+
|
123 |
# Tokens that are replaced as a whole by the "ZERO WIDTH SPACE" placeholder.
# NOTE(review): none of these is U+200B ZERO WIDTH SPACE itself; the
# placeholder text is kept byte-for-byte because other code may match on it.
_INVISIBLE_TOKENS = frozenset({
    "\u2028",   # LINE SEPARATOR
    "\u202b",   # RIGHT-TO-LEFT EMBEDDING
    "\u202c",   # POP DIRECTIONAL FORMATTING
    "\u202d",   # LEFT-TO-RIGHT OVERRIDE
    "\u202f",   # NARROW NO-BREAK SPACE
    "\u200e",   # LEFT-TO-RIGHT MARK
    "\u200f",   # RIGHT-TO-LEFT MARK
    ",\u200e",  # comma followed by LEFT-TO-RIGHT MARK
})


def escape_decode(token: str) -> str:
    """Return *token* with invisible or control characters made visible.

    A token that is exactly U+FFFD, or exactly one of the invisible
    formatting sequences in ``_INVISIBLE_TOKENS``, is replaced as a whole by
    a fixed placeholder.  In every other token each character is passed to
    ``escape_byte``; a non-empty result replaces the character, an empty
    result keeps the character unchanged.

    The work is done on ``str`` rather than on the UTF-8 bytes.  The result
    is the same, because every byte of a multi-byte UTF-8 sequence is
    >= 0x80 and ``escape_byte`` only replaces values below that, and it
    avoids the encode/decode round trip, which raised ``UnicodeEncodeError``
    for tokens containing lone surrogates.

    Args:
        token: Vocabulary entry to make printable.

    Returns:
        The escaped token; identical to *token* when nothing needs escaping.
    """
    if token == "\ufffd":
        return "<!!!!VOCABVIEWER REPLACEMENT CHARACTER U+FFFD!!!!>"
    if token in _INVISIBLE_TOKENS:
        return "<!!!!VOCABVIEWER ZERO WIDTH SPACE !!!!>"

    pieces = []
    for ch in token:
        code = ord(ch)
        # Only ASCII code points are encoded as a single byte, so only they
        # can be one of the bytes escape_byte knows about.
        replacement = escape_byte(code) if code < 0x80 else ""
        pieces.append(replacement or ch)
    return "".join(pieces)
|
155 |
# sort
|