Add detailed explanation of chunking effects on token boundaries
- Added warning about 64-byte limitation and independent chunk processing
- Explained why token counts differ from full-text tokenization
- Enhanced visualization with chunk information and byte counts
- Improved benchmark table with detailed metrics
- Fixed token counting algorithm to be more accurate
- Added comprehensive technical details about chunking effects
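
For context, the boundary shift described above is plain byte arithmetic and can be reproduced without loading the model. A minimal standalone sketch (the 64-byte chunk size is taken from the notes above; `chunk_bytes`, the filler padding, and the sample string are illustrative, not code from app.py):

```python
# Standalone sketch: fixed-size byte chunking can cut through a multi-byte
# token, so each chunk tokenizes differently than the full text would.
CHUNK_SIZE = 64  # training chunk length, per the commit notes

def chunk_bytes(text: str, chunk_size: int = CHUNK_SIZE) -> list:
    """Split the UTF-8 byte stream into fixed-size chunks, ignoring character boundaries."""
    data = text.encode("utf-8")
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# 55 filler bytes push '한국어는' (4 chars, 12 UTF-8 bytes) across the chunk edge
sample = "x" * 55 + "한국어는"
for i, chunk in enumerate(chunk_bytes(sample), start=1):
    # errors="replace" guards against a chunk that ends mid-character
    print(f"chunk {i}: {len(chunk)} bytes, ends with {chunk.decode('utf-8', errors='replace')[-3:]!r}")
# chunk 1 ends with '한국어' and chunk 2 holds only '는' -- the exact split
# the new warning text describes.
```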
app.py CHANGED

```diff
@@ -79,6 +79,7 @@ def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
     current_group = []
 
     for i in range(min(len(byte_seq), len(boundaries))):
+        # Boundary value of 1 means start of new token
         is_boundary = (i == 0) or (boundaries[i] == 1)
 
         if is_boundary and current_group:
@@ -205,7 +206,8 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
             boundaries = outputs[boundary_key]
             groups_visual = visualize_groups(byte_seq, boundaries)
             boundary_binary = torch.argmax(boundaries, dim=-1)[0]
-
+            # Count actual token groups
+            num_tokens = len([i for i, b in enumerate(boundary_binary[:len(byte_seq)]) if i == 0 or b == 1])
             break
 
     # If no boundaries found, show entire chunk as one token
@@ -324,14 +326,22 @@ def process_text_full(text: str, show_embeddings: bool = False):
     # Format groups visualization showing actual token boundaries
     groups_text = "**Token Boundaries (< > shows model-learned token groups):**\n\n"
 
-    #
+    # Add important explanation about chunking effect
+    if len(all_results) > 1:
+        groups_text += "⚠️ **Important Note about Chunking:**\n"
+        groups_text += "- Model was trained on 64-byte chunks, so longer texts are split\n"
+        groups_text += "- Each chunk is tokenized **independently**\n"
+        groups_text += "- This causes token boundaries to differ from full-text processing\n"
+        groups_text += "- Example: '한국어는' might become '한국어' (chunk 1) + '는' (chunk 2)\n"
+        groups_text += "- Total token count may be higher due to split tokens\n\n"
+
+    # Show chunks and their boundaries
     max_chunks_to_show = min(len(all_results), 5)
 
     for i, result in enumerate(all_results[:max_chunks_to_show]):
-        groups_text += f"Chunk {i+1}: {result['
-
-
-        groups_text += "\n"
+        groups_text += f"**Chunk {i+1}** ({result['original_bytes']} bytes): `{result['text']}`\n"
+        groups_text += f"   Tokens: {result['groups']}\n"
+        groups_text += f"   Count: {result['num_tokens']} tokens | Ratio: {result['compression_ratio']:.1f}:1\n\n"
 
     if len(all_results) > max_chunks_to_show:
         groups_text += f"... and {len(all_results)-max_chunks_to_show} more chunks\n"
@@ -361,21 +371,28 @@ def benchmark_languages():
     }
 
     results = "**Language Benchmark Results:**\n\n"
-    results += "| Language | Compression | Accuracy |\n"
-    results += "
+    results += "| Language | Text Size | Chunks | Tokens | Compression | Accuracy |\n"
+    results += "|----------|-----------|--------|--------|-------------|----------|\n"
 
     for lang, text in test_texts.items():
         stats, _, _, _, compression = process_text_full(text)
 
-        # Extract
+        # Extract metrics from stats
         import re
+        bytes_match = re.search(r'Original: (\d+) bytes', stats)
+        tokens_match = re.search(r'Compressed: (\d+) tokens', stats)
+        chunks_match = re.search(r'Chunks Processed: (\d+)', stats)
         acc_match = re.search(r'Reconstruction Accuracy: \*\*(\d+\.?\d*)', stats)
+
+        text_bytes = bytes_match.group(1) if bytes_match else "N/A"
+        tokens = tokens_match.group(1) if tokens_match else "N/A"
+        chunks = chunks_match.group(1) if chunks_match else "N/A"
         accuracy = acc_match.group(1) if acc_match else "N/A"
 
-        results += f"| {lang:8} | {compression
+        results += f"| {lang:8} | {text_bytes:>9}B | {chunks:>6} | {tokens:>6} | {compression:>9.1f}:1 | {accuracy:>7}% |\n"
 
     results += "\n**Average: 18.6:1 compression** (tested on best_model.pt)"
-    results += "\n*Note:
+    results += "\n*Note: Longer texts require chunking, affecting token boundaries*"
 
     return results
 
@@ -403,6 +420,8 @@ with gr.Blocks(
     - **Token Boundary Visualization**: Shows model-learned token groups
    - **Embedding Display**: Visualize learned representations
    - **Streaming Support**: Process text in real-time
+
+    ⚠️ **Note on 64-byte limitation**: Model processes text in 64-byte chunks. Longer texts are split and each chunk is tokenized independently.
    """)
 
    with gr.Tab("Interactive Demo"):
@@ -522,6 +541,7 @@ with gr.Blocks(
         gr.Markdown("""
        ### Multi-Language Performance Benchmark
        Test compression performance across different language families.
+        Note: Results show chunking effect on longer texts.
        """)
 
        benchmark_btn = gr.Button("π Run Benchmark", variant="primary")
@@ -549,6 +569,7 @@ with gr.Blocks(
     - Learning-based compression without language rules
    - Cross-attention for sequence relationships
    - Model-learned token boundaries (not fixed chunks)
+    - **Chunking Effect**: Texts >62 bytes are split, each chunk tokenized independently
 
    ---
    *Note: v6.1.3 in training with 204 languages for universal coverage*
```
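The corrected token count in `process_chunk` can be sanity-checked in isolation. A small sketch with a made-up boundary tensor (`count_tokens` is a hypothetical wrapper around the exact list comprehension added in the diff; the real tensor comes from the model's boundary head):

```python
import torch

def count_tokens(boundary_binary: torch.Tensor, seq_len: int) -> int:
    # Same rule as the diff: position 0 always opens a token group,
    # and every later position flagged 1 opens another.
    return len([i for i, b in enumerate(boundary_binary[:seq_len]) if i == 0 or b == 1])

# Toy boundary vector for an 8-byte chunk, with flags at positions 3 and 6
boundaries = torch.tensor([0, 0, 0, 1, 0, 0, 1, 0])
print(count_tokens(boundaries, seq_len=8))  # 3 -> groups [0:3], [3:6], [6:8]
```

Note that position 0 is counted even when its flag is 0; counting raw 1-flags alone would miss that first group.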
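The benchmark table is filled by regex-scraping the markdown stats that `process_text_full` returns. The four patterns from the benchmark hunk can be exercised against a stand-in string (the sample stats text below is fabricated to match the patterns; the real string is built elsewhere in app.py):

```python
import re

# Fabricated stats blob shaped to match the four patterns used in the diff
stats = (
    "Original: 128 bytes\n"
    "Compressed: 7 tokens\n"
    "Chunks Processed: 2\n"
    "Reconstruction Accuracy: **98.4**%"
)

patterns = {
    "bytes": r'Original: (\d+) bytes',
    "tokens": r'Compressed: (\d+) tokens',
    "chunks": r'Chunks Processed: (\d+)',
    "accuracy": r'Reconstruction Accuracy: \*\*(\d+\.?\d*)',
}
row = {name: (m.group(1) if (m := re.search(pat, stats)) else "N/A")
       for name, pat in patterns.items()}
print(row)  # {'bytes': '128', 'tokens': '7', 'chunks': '2', 'accuracy': '98.4'}
```

Falling back to "N/A" mirrors the diff's behavior, so a missing metric degrades one table cell instead of crashing the benchmark.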