xu-song committed on
Commit
b0c41e0
·
1 Parent(s): 70c093d
compression_app.py CHANGED
@@ -27,7 +27,8 @@ from compression_util import get_compression_leaderboard, common_corpuses
27
  # exactly reconstructed from compressed tokens
28
  docs = """## 📖 What is a good tokenizer?
29
 
30
- From a compression perspective, a good tokenizer should be lossless, and keep high compression rate (less tokens).
 
31
  The encoding and decoding process can be formulated as
32
  ```python
33
  token_ids = tokenizer.encode(input_text) # compressed tokens
@@ -142,9 +143,9 @@ with gr.Blocks(theme=theme) as demo:
142
  )
143
 
144
  gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
145
- "The leaderboard aims to evaluate tokenizer performance on different languages.\n"
146
  "Lower `oov_ratio` refers to less out-of-vocabulary tokens.\n"
147
- "Higher `char/token` means less words be segmented into subwords."
148
  )
149
  search_bar = gr.Textbox(
150
  placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
 
27
  # exactly reconstructed from compressed tokens
28
  docs = """## 📖 What is a good tokenizer?
29
 
30
+ From a compression perspective, a good tokenizer should be lossless,
31
+ and keep high compression rate (less tokens for a given text).
32
  The encoding and decoding process can be formulated as
33
  ```python
34
  token_ids = tokenizer.encode(input_text) # compressed tokens
 
143
  )
144
 
145
  gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
146
+ "This leaderboard aims to evaluate tokenizer performance on different languages.\n"
147
  "Lower `oov_ratio` refers to less out-of-vocabulary tokens.\n"
148
+ "Lower `char/token` means more words might be segmented into subwords."
149
  )
150
  search_bar = gr.Textbox(
151
  placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
compression_util.py CHANGED
@@ -297,9 +297,7 @@ def get_compression_leaderboard(
297
  reverse_unit = f"{file_size_unit}/{token_number_unit}"
298
  stats = to_dataframe(stats, ["char/token", unit, reverse_unit])
299
  stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
300
-
301
  # stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
302
-
303
  stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={"char/token": ' ⬇️char/token'}) #
304
  return stats
305
 
 
297
  reverse_unit = f"{file_size_unit}/{token_number_unit}"
298
  stats = to_dataframe(stats, ["char/token", unit, reverse_unit])
299
  stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
 
300
  # stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
 
301
  stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={"char/token": ' ⬇️char/token'}) #
302
  return stats
303
 
stats/compression_rate.json CHANGED
The diff for this file is too large to render. See raw diff