Add detailed explanation of chunking effects on token boundaries
- Added warning about 64-byte limitation and independent chunk processing
- Explained why token counts differ from full-text tokenization
- Enhanced visualization with chunk information and byte counts
- Improved benchmark table with detailed metrics
- Fixed token counting algorithm to be more accurate
- Added comprehensive technical details about chunking effects
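
For context, the boundary shift described above is plain byte arithmetic and can be reproduced without loading the model. A minimal standalone sketch (the 64-byte chunk size is taken from the notes above; `chunk_bytes`, the filler padding, and the sample string are illustrative, not code from app.py):

```python
# Standalone sketch: fixed-size byte chunking can cut through a multi-byte
# token, so each chunk tokenizes differently than the full text would.
CHUNK_SIZE = 64  # training chunk length, per the commit notes

def chunk_bytes(text: str, chunk_size: int = CHUNK_SIZE) -> list:
    """Split the UTF-8 byte stream into fixed-size chunks, ignoring character boundaries."""
    data = text.encode("utf-8")
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# 55 filler bytes push '한국어는' (4 chars, 12 UTF-8 bytes) across the chunk edge
sample = "x" * 55 + "한국어는"
for i, chunk in enumerate(chunk_bytes(sample), start=1):
    # errors="replace" guards against a chunk that ends mid-character
    print(f"chunk {i}: {len(chunk)} bytes, ends with {chunk.decode('utf-8', errors='replace')[-3:]!r}")
# chunk 1 ends with '한국어' and chunk 2 holds only '는' -- the exact split
# the new warning text describes.
```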
app.py CHANGED

```diff
@@ -79,6 +79,7 @@ def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
     current_group = []
 
     for i in range(min(len(byte_seq), len(boundaries))):
+        # Boundary value of 1 means start of new token
         is_boundary = (i == 0) or (boundaries[i] == 1)
 
         if is_boundary and current_group:
@@ -205,7 +206,8 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
             boundaries = outputs[boundary_key]
             groups_visual = visualize_groups(byte_seq, boundaries)
             boundary_binary = torch.argmax(boundaries, dim=-1)[0]
-
+            # Count actual token groups
+            num_tokens = len([i for i, b in enumerate(boundary_binary[:len(byte_seq)]) if i == 0 or b == 1])
             break
 
     # If no boundaries found, show entire chunk as one token
@@ -324,14 +326,22 @@ def process_text_full(text: str, show_embeddings: bool = False):
     # Format groups visualization showing actual token boundaries
     groups_text = "**Token Boundaries (< > shows model-learned token groups):**\n\n"
 
-    #
+    # Add important explanation about chunking effect
+    if len(all_results) > 1:
+        groups_text += "⚠️ **Important Note about Chunking:**\n"
+        groups_text += "- Model was trained on 64-byte chunks, so longer texts are split\n"
+        groups_text += "- Each chunk is tokenized **independently**\n"
+        groups_text += "- This causes token boundaries to differ from full-text processing\n"
+        groups_text += "- Example: '한국어는' might become '한국어' (chunk 1) + '는' (chunk 2)\n"
+        groups_text += "- Total token count may be higher due to split tokens\n\n"
+
+    # Show chunks and their boundaries
     max_chunks_to_show = min(len(all_results), 5)
 
     for i, result in enumerate(all_results[:max_chunks_to_show]):
-        groups_text += f"Chunk {i+1}: {result['
-
-
-        groups_text += "\n"
+        groups_text += f"**Chunk {i+1}** ({result['original_bytes']} bytes): `{result['text']}`\n"
+        groups_text += f"   Tokens: {result['groups']}\n"
+        groups_text += f"   Count: {result['num_tokens']} tokens | Ratio: {result['compression_ratio']:.1f}:1\n\n"
 
     if len(all_results) > max_chunks_to_show:
         groups_text += f"... and {len(all_results)-max_chunks_to_show} more chunks\n"
@@ -361,21 +371,28 @@ def benchmark_languages():
     }
 
     results = "**Language Benchmark Results:**\n\n"
-    results += "| Language | Compression | Accuracy |\n"
-    results += "
+    results += "| Language | Text Size | Chunks | Tokens | Compression | Accuracy |\n"
+    results += "|----------|-----------|--------|--------|-------------|----------|\n"
 
     for lang, text in test_texts.items():
         stats, _, _, _, compression = process_text_full(text)
 
-        # Extract
+        # Extract metrics from stats
         import re
+        bytes_match = re.search(r'Original: (\d+) bytes', stats)
+        tokens_match = re.search(r'Compressed: (\d+) tokens', stats)
+        chunks_match = re.search(r'Chunks Processed: (\d+)', stats)
         acc_match = re.search(r'Reconstruction Accuracy: \*\*(\d+\.?\d*)', stats)
+
+        text_bytes = bytes_match.group(1) if bytes_match else "N/A"
+        tokens = tokens_match.group(1) if tokens_match else "N/A"
+        chunks = chunks_match.group(1) if chunks_match else "N/A"
         accuracy = acc_match.group(1) if acc_match else "N/A"
 
-        results += f"| {lang:8} | {compression
+        results += f"| {lang:8} | {text_bytes:>9}B | {chunks:>6} | {tokens:>6} | {compression:>9.1f}:1 | {accuracy:>7}% |\n"
 
     results += "\n**Average: 18.6:1 compression** (tested on best_model.pt)"
-    results += "\n*Note:
+    results += "\n*Note: Longer texts require chunking, affecting token boundaries*"
 
     return results
 
@@ -403,6 +420,8 @@ with gr.Blocks(
     - **Token Boundary Visualization**: Shows model-learned token groups
    - **Embedding Display**: Visualize learned representations
    - **Streaming Support**: Process text in real-time
+
+    ⚠️ **Note on 64-byte limitation**: Model processes text in 64-byte chunks. Longer texts are split and each chunk is tokenized independently.
    """)
 
    with gr.Tab("Interactive Demo"):
@@ -522,6 +541,7 @@ with gr.Blocks(
         gr.Markdown("""
        ### Multi-Language Performance Benchmark
        Test compression performance across different language families.
+        Note: Results show chunking effect on longer texts.
        """)
 
        benchmark_btn = gr.Button("π Run Benchmark", variant="primary")
@@ -549,6 +569,7 @@ with gr.Blocks(
     - Learning-based compression without language rules
    - Cross-attention for sequence relationships
    - Model-learned token boundaries (not fixed chunks)
+    - **Chunking Effect**: Texts >62 bytes are split, each chunk tokenized independently
 
    ---
    *Note: v6.1.3 in training with 204 languages for universal coverage*
```
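The corrected token count in `process_chunk` can be sanity-checked in isolation. A small sketch with a made-up boundary tensor (`count_tokens` is a hypothetical wrapper around the exact list comprehension added in the diff; the real tensor comes from the model's boundary head):

```python
import torch

def count_tokens(boundary_binary: torch.Tensor, seq_len: int) -> int:
    # Same rule as the diff: position 0 always opens a token group,
    # and every later position flagged 1 opens another.
    return len([i for i, b in enumerate(boundary_binary[:seq_len]) if i == 0 or b == 1])

# Toy boundary vector for an 8-byte chunk, with flags at positions 3 and 6
boundaries = torch.tensor([0, 0, 0, 1, 0, 0, 1, 0])
print(count_tokens(boundaries, seq_len=8))  # 3 -> groups [0:3], [3:6], [6:8]
```

Note that position 0 is counted even when its flag is 0; counting raw 1-flags alone would miss that first group.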
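The benchmark table is filled by regex-scraping the markdown stats that `process_text_full` returns. The four patterns from the benchmark hunk can be exercised against a stand-in string (the sample stats text below is fabricated to match the patterns; the real string is built elsewhere in app.py):

```python
import re

# Fabricated stats blob shaped to match the four patterns used in the diff
stats = (
    "Original: 128 bytes\n"
    "Compressed: 7 tokens\n"
    "Chunks Processed: 2\n"
    "Reconstruction Accuracy: **98.4**%"
)

patterns = {
    "bytes": r'Original: (\d+) bytes',
    "tokens": r'Compressed: (\d+) tokens',
    "chunks": r'Chunks Processed: (\d+)',
    "accuracy": r'Reconstruction Accuracy: \*\*(\d+\.?\d*)',
}
row = {name: (m.group(1) if (m := re.search(pat, stats)) else "N/A")
       for name, pat in patterns.items()}
print(row)  # {'bytes': '128', 'tokens': '7', 'chunks': '2', 'accuracy': '98.4'}
```

Falling back to "N/A" mirrors the diff's behavior, so a missing metric degrades one table cell instead of crashing the benchmark.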