Spaces:
Runtime error
Runtime error
Commit
•
71c7fc4
1
Parent(s):
8d000e9
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,7 @@ import os
|
|
10 |
HF_TOKEN = os.getenv('HF_TOKEN')
|
11 |
|
12 |
if HF_TOKEN:
|
|
|
13 |
login(token=HF_TOKEN)
|
14 |
|
15 |
# Load additional tokenizers from transformers
|
@@ -63,29 +64,16 @@ if meta_llama_tokenizer:
|
|
63 |
tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer
|
64 |
|
65 |
def compare_tokenizers(tokenizer_name, text):
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
]:
|
72 |
-
tokenizer = tokenizers[tokenizer_name]()
|
73 |
-
tokens = tokenizer.tokenize(text)
|
74 |
-
encoded_output = tokenizer.encode(text, add_special_tokens=True)
|
75 |
-
decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
|
76 |
-
else:
|
77 |
-
# AraNizer tokenizers
|
78 |
-
tokenizer = tokenizers[tokenizer_name]()
|
79 |
-
tokens = tokenizer.tokenize(text)
|
80 |
-
encoded_output = tokenizer.encode(text, add_special_tokens=True)
|
81 |
-
decoded_text = tokenizer.decode(encoded_output)
|
82 |
-
|
83 |
# Prepare the results to be displayed in HTML format
|
84 |
-
tokens_arabic = [token.encode('utf-8').decode('utf-8') if isinstance(token, bytes) else token for token in tokens]
|
85 |
results_html = f"""
|
86 |
<div>
|
87 |
<h3>Tokenizer: {tokenizer_name}</h3>
|
88 |
-
<p><strong>Tokens:</strong> {
|
89 |
<p><strong>Encoded:</strong> {encoded_output}</p>
|
90 |
<p><strong>Decoded:</strong> {decoded_text}</p>
|
91 |
</div>
|
|
|
10 |
HF_TOKEN = os.getenv('HF_TOKEN')
|
11 |
|
12 |
if HF_TOKEN:
|
13 |
+
HF_TOKEN = HF_TOKEN.strip() # Remove any leading or trailing whitespace/newlines
|
14 |
login(token=HF_TOKEN)
|
15 |
|
16 |
# Load additional tokenizers from transformers
|
|
|
64 |
tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer
|
65 |
|
66 |
def compare_tokenizers(tokenizer_name, text):
|
67 |
+
tokenizer = tokenizers[tokenizer_name]()
|
68 |
+
tokens = tokenizer.tokenize(text)
|
69 |
+
encoded_output = tokenizer.encode(text, add_special_tokens=True)
|
70 |
+
decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
|
71 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
# Prepare the results to be displayed in HTML format
|
|
|
73 |
results_html = f"""
|
74 |
<div>
|
75 |
<h3>Tokenizer: {tokenizer_name}</h3>
|
76 |
+
<p><strong>Tokens:</strong> {tokens}</p>
|
77 |
<p><strong>Encoded:</strong> {encoded_output}</p>
|
78 |
<p><strong>Decoded:</strong> {decoded_text}</p>
|
79 |
</div>
|