HeshamHaroon committed on
Commit
71c7fc4
1 Parent(s): 8d000e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -19
app.py CHANGED
@@ -10,6 +10,7 @@ import os
10
  HF_TOKEN = os.getenv('HF_TOKEN')
11
 
12
  if HF_TOKEN:
 
13
  login(token=HF_TOKEN)
14
 
15
  # Load additional tokenizers from transformers
@@ -63,29 +64,16 @@ if meta_llama_tokenizer:
63
  tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer
64
 
65
  def compare_tokenizers(tokenizer_name, text):
66
- # Handle the transformer tokenizers separately due to API differences
67
- if tokenizer_name in [
68
- "FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B",
69
- "inception-mbzuai/jais-13b", "aubmindlab/bert-base-arabertv2",
70
- "meta-llama/Meta-Llama-3-8B", "CohereForAI/c4ai-command-r-v01", "CohereForAI/c4ai-command-r-plus"
71
- ]:
72
- tokenizer = tokenizers[tokenizer_name]()
73
- tokens = tokenizer.tokenize(text)
74
- encoded_output = tokenizer.encode(text, add_special_tokens=True)
75
- decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
76
- else:
77
- # AraNizer tokenizers
78
- tokenizer = tokenizers[tokenizer_name]()
79
- tokens = tokenizer.tokenize(text)
80
- encoded_output = tokenizer.encode(text, add_special_tokens=True)
81
- decoded_text = tokenizer.decode(encoded_output)
82
-
83
  # Prepare the results to be displayed in HTML format
84
- tokens_arabic = [token.encode('utf-8').decode('utf-8') if isinstance(token, bytes) else token for token in tokens]
85
  results_html = f"""
86
  <div>
87
  <h3>Tokenizer: {tokenizer_name}</h3>
88
- <p><strong>Tokens:</strong> {tokens_arabic}</p>
89
  <p><strong>Encoded:</strong> {encoded_output}</p>
90
  <p><strong>Decoded:</strong> {decoded_text}</p>
91
  </div>
 
10
  HF_TOKEN = os.getenv('HF_TOKEN')
11
 
12
  if HF_TOKEN:
13
+ HF_TOKEN = HF_TOKEN.strip() # Remove any leading or trailing whitespace/newlines
14
  login(token=HF_TOKEN)
15
 
16
  # Load additional tokenizers from transformers
 
64
  tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer
65
 
66
  def compare_tokenizers(tokenizer_name, text):
67
+ tokenizer = tokenizers[tokenizer_name]()
68
+ tokens = tokenizer.tokenize(text)
69
+ encoded_output = tokenizer.encode(text, add_special_tokens=True)
70
+ decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
71
+
 
 
 
 
 
 
 
 
 
 
 
 
72
  # Prepare the results to be displayed in HTML format
 
73
  results_html = f"""
74
  <div>
75
  <h3>Tokenizer: {tokenizer_name}</h3>
76
+ <p><strong>Tokens:</strong> {tokens}</p>
77
  <p><strong>Encoded:</strong> {encoded_output}</p>
78
  <p><strong>Decoded:</strong> {decoded_text}</p>
79
  </div>