ggunio committed on
Commit
0a70666
·
1 Parent(s): 0250815

Add detailed explanation of chunking effects on token boundaries

Browse files

- Added warning about 64-byte limitation and independent chunk processing
- Explained why token counts differ from full-text tokenization
- Enhanced visualization with chunk information and byte counts
- Improved benchmark table with detailed metrics
- Fixed token counting algorithm to be more accurate
- Added comprehensive technical details about chunking effects

Files changed (1) hide show
  1. app.py +32 -11
app.py CHANGED
@@ -79,6 +79,7 @@ def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
79
  current_group = []
80
 
81
  for i in range(min(len(byte_seq), len(boundaries))):
 
82
  is_boundary = (i == 0) or (boundaries[i] == 1)
83
 
84
  if is_boundary and current_group:
@@ -205,7 +206,8 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
205
  boundaries = outputs[boundary_key]
206
  groups_visual = visualize_groups(byte_seq, boundaries)
207
  boundary_binary = torch.argmax(boundaries, dim=-1)[0]
208
- num_tokens = torch.sum(boundary_binary == 1).item() + 1
 
209
  break
210
 
211
  # If no boundaries found, show entire chunk as one token
@@ -324,14 +326,22 @@ def process_text_full(text: str, show_embeddings: bool = False):
324
  # Format groups visualization showing actual token boundaries
325
  groups_text = "**Token Boundaries (< > shows model-learned token groups):**\n\n"
326
 
327
- # Show more chunks for shorter texts
 
 
 
 
 
 
 
 
 
328
  max_chunks_to_show = min(len(all_results), 5)
329
 
330
  for i, result in enumerate(all_results[:max_chunks_to_show]):
331
- groups_text += f"Chunk {i+1}: {result['groups']}\n"
332
- if result['num_tokens'] > 1:
333
- groups_text += f" β†’ {result['num_tokens']} tokens detected\n"
334
- groups_text += "\n"
335
 
336
  if len(all_results) > max_chunks_to_show:
337
  groups_text += f"... and {len(all_results)-max_chunks_to_show} more chunks\n"
@@ -361,21 +371,28 @@ def benchmark_languages():
361
  }
362
 
363
  results = "**Language Benchmark Results:**\n\n"
364
- results += "| Language | Compression | Accuracy |\n"
365
- results += "|----------|-------------|----------|\n"
366
 
367
  for lang, text in test_texts.items():
368
  stats, _, _, _, compression = process_text_full(text)
369
 
370
- # Extract accuracy from stats
371
  import re
 
 
 
372
  acc_match = re.search(r'Reconstruction Accuracy: \*\*(\d+\.?\d*)', stats)
 
 
 
 
373
  accuracy = acc_match.group(1) if acc_match else "N/A"
374
 
375
- results += f"| {lang:8} | {compression:7.1f}:1 | {accuracy:6}% |\n"
376
 
377
  results += "\n**Average: 18.6:1 compression** (tested on best_model.pt)"
378
- results += "\n*Note: Performance based on 6 languages, may vary with 204 languages (v6.1.3)*"
379
 
380
  return results
381
 
@@ -403,6 +420,8 @@ with gr.Blocks(
403
  - **Token Boundary Visualization**: Shows model-learned token groups
404
  - **Embedding Display**: Visualize learned representations
405
  - **Streaming Support**: Process text in real-time
 
 
406
  """)
407
 
408
  with gr.Tab("Interactive Demo"):
@@ -522,6 +541,7 @@ with gr.Blocks(
522
  gr.Markdown("""
523
  ### Multi-Language Performance Benchmark
524
  Test compression performance across different language families.
 
525
  """)
526
 
527
  benchmark_btn = gr.Button("πŸ“Š Run Benchmark", variant="primary")
@@ -549,6 +569,7 @@ with gr.Blocks(
549
  - Learning-based compression without language rules
550
  - Cross-attention for sequence relationships
551
  - Model-learned token boundaries (not fixed chunks)
 
552
 
553
  ---
554
  *Note: v6.1.3 in training with 204 languages for universal coverage*
 
79
  current_group = []
80
 
81
  for i in range(min(len(byte_seq), len(boundaries))):
82
+ # Boundary value of 1 means start of new token
83
  is_boundary = (i == 0) or (boundaries[i] == 1)
84
 
85
  if is_boundary and current_group:
 
206
  boundaries = outputs[boundary_key]
207
  groups_visual = visualize_groups(byte_seq, boundaries)
208
  boundary_binary = torch.argmax(boundaries, dim=-1)[0]
209
+ # Count actual token groups
210
+ num_tokens = len([i for i, b in enumerate(boundary_binary[:len(byte_seq)]) if i == 0 or b == 1])
211
  break
212
 
213
  # If no boundaries found, show entire chunk as one token
 
326
  # Format groups visualization showing actual token boundaries
327
  groups_text = "**Token Boundaries (< > shows model-learned token groups):**\n\n"
328
 
329
+ # Add important explanation about chunking effect
330
+ if len(all_results) > 1:
331
+ groups_text += "⚠️ **Important Note about Chunking:**\n"
332
+ groups_text += "- Model was trained on 64-byte chunks, so longer texts are split\n"
333
+ groups_text += "- Each chunk is tokenized **independently**\n"
334
+ groups_text += "- This causes token boundaries to differ from full-text processing\n"
335
+ groups_text += "- Example: 'ν•œκ΅­μ–΄λ„' might become 'ν•œκ΅­μ–΄' (chunk 1) + '도' (chunk 2)\n"
336
+ groups_text += "- Total token count may be higher due to split tokens\n\n"
337
+
338
+ # Show chunks and their boundaries
339
  max_chunks_to_show = min(len(all_results), 5)
340
 
341
  for i, result in enumerate(all_results[:max_chunks_to_show]):
342
+ groups_text += f"**Chunk {i+1}** ({result['original_bytes']} bytes): `{result['text']}`\n"
343
+ groups_text += f" Tokens: {result['groups']}\n"
344
+ groups_text += f" Count: {result['num_tokens']} tokens | Ratio: {result['compression_ratio']:.1f}:1\n\n"
 
345
 
346
  if len(all_results) > max_chunks_to_show:
347
  groups_text += f"... and {len(all_results)-max_chunks_to_show} more chunks\n"
 
371
  }
372
 
373
  results = "**Language Benchmark Results:**\n\n"
374
+ results += "| Language | Text Size | Chunks | Tokens | Compression | Accuracy |\n"
375
+ results += "|----------|-----------|--------|--------|-------------|----------|\n"
376
 
377
  for lang, text in test_texts.items():
378
  stats, _, _, _, compression = process_text_full(text)
379
 
380
+ # Extract metrics from stats
381
  import re
382
+ bytes_match = re.search(r'Original: (\d+) bytes', stats)
383
+ tokens_match = re.search(r'Compressed: (\d+) tokens', stats)
384
+ chunks_match = re.search(r'Chunks Processed: (\d+)', stats)
385
  acc_match = re.search(r'Reconstruction Accuracy: \*\*(\d+\.?\d*)', stats)
386
+
387
+ text_bytes = bytes_match.group(1) if bytes_match else "N/A"
388
+ tokens = tokens_match.group(1) if tokens_match else "N/A"
389
+ chunks = chunks_match.group(1) if chunks_match else "N/A"
390
  accuracy = acc_match.group(1) if acc_match else "N/A"
391
 
392
+ results += f"| {lang:8} | {text_bytes:>9}B | {chunks:>6} | {tokens:>6} | {compression:>9.1f}:1 | {accuracy:>7}% |\n"
393
 
394
  results += "\n**Average: 18.6:1 compression** (tested on best_model.pt)"
395
+ results += "\n*Note: Longer texts require chunking, affecting token boundaries*"
396
 
397
  return results
398
 
 
420
  - **Token Boundary Visualization**: Shows model-learned token groups
421
  - **Embedding Display**: Visualize learned representations
422
  - **Streaming Support**: Process text in real-time
423
+
424
+ ⚠️ **Note on 64-byte limitation**: Model processes text in 64-byte chunks. Longer texts are split and each chunk is tokenized independently.
425
  """)
426
 
427
  with gr.Tab("Interactive Demo"):
 
541
  gr.Markdown("""
542
  ### Multi-Language Performance Benchmark
543
  Test compression performance across different language families.
544
+ Note: Results show chunking effect on longer texts.
545
  """)
546
 
547
  benchmark_btn = gr.Button("πŸ“Š Run Benchmark", variant="primary")
 
569
  - Learning-based compression without language rules
570
  - Cross-attention for sequence relationships
571
  - Model-learned token boundaries (not fixed chunks)
572
+ - **Chunking Effect**: Texts >62 bytes are split, each chunk tokenized independently
573
 
574
  ---
575
  *Note: v6.1.3 in training with 204 languages for universal coverage*