import streamlit as st
import json

# Set page configuration
st.set_page_config(
    page_title="tokeniser-py Demonstration",
    page_icon="🐣",
    layout="wide",
)
# Custom CSS for better UI
st.markdown("""
<style>
.main {
    background-color: #0e1117;
    color: white;
}
.stTextInput > div > div > input, .stTextArea > div > div > textarea {
    background-color: #1e2130;
    color: white;
    border: 1px solid #30343e;
    border-radius: 4px;
    padding: 10px;
}
.token-display {
    margin-top: 20px;
    padding: 15px;
    border-radius: 5px;
    background-color: #1e2130;
    line-height: 2;
    overflow-wrap: break-word;
}
.token {
    display: inline-block;
    padding: 2px 4px;
    margin: 2px;
    border-radius: 3px;
    position: relative;
    cursor: pointer;
    color: #0e1117 !important;
    font-weight: 600;
    text-shadow: 0px 0px 1px rgba(0,0,0,0.2);
}
.token:hover::after {
    content: attr(data-id);
    position: absolute;
    top: -25px;
    left: 0;
    background: #3c4356;
    color: white;
    padding: 2px 6px;
    border-radius: 3px;
    font-size: 12px;
    white-space: nowrap;
    z-index: 100;
}
.button-container {
    display: flex;
    gap: 10px;
    margin-bottom: 15px;
}
.stButton button {
    background-color: #2c313d;
    border: none;
    color: white;
}
.stButton button:hover {
    background-color: #3c4356;
}
.info-box {
    margin-top: 20px;
    padding: 20px;
    border-radius: 5px;
    background-color: #1e2130;
    font-size: 14px;
    line-height: 1.6;
}
.quote {
    border-left: 4px solid #00ba7c;
    padding-left: 10px;
    margin: 10px 0;
    color: #e0e0e0;
}
.highlight {
    background-color: rgba(0, 186, 124, 0.15);
    padding: 2px 4px;
    border-radius: 3px;
    font-weight: 500;
}
.comparison-table {
    background-color: #262b38;
    padding: 15px;
    border-radius: 5px;
    margin: 15px 0;
}
.section-title {
    font-weight: 600;
    margin-top: 15px;
    margin-bottom: 8px;
    color: #00ba7c;
}
.stRadio [role=radiogroup] {
    background-color: #1e2130;
    padding: 5px;
    border-radius: 5px;
}
.header-container {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 10px 0;
    margin-top: -40px;
}
.stats-container {
    display: flex;
    gap: 20px;
    padding: 10px;
    background-color: #1e2130;
    border-radius: 5px;
    margin-bottom: 20px;
}
.stat-box {
    padding: 10px;
}
.stat-label {
    font-size: 0.9em;
    color: #aaa;
}
.stat-value {
    font-size: 1.5em;
    font-weight: bold;
}
a {
    color: #00ba7c !important;
    text-decoration: none;
}
a:hover {
    text-decoration: underline;
}
.monospace {
    font-family: monospace;
}
.note-box {
    background-color: rgba(255, 204, 0, 0.1);
    border-left: 3px solid rgba(255, 204, 0, 0.7);
    padding: 10px 15px;
    margin: 10px 0;
    border-radius: 0 5px 5px 0;
}
.buttons-row {
    display: flex;
    gap: 10px;
}
/* Enhanced bullet points styling */
.bullet-point {
    display: flex;
    align-items: baseline;
    margin: 8px 0;
    padding: 4px 0;
}
.bullet-point-icon {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    min-width: 24px;
    height: 24px;
    background-color: rgba(0, 186, 124, 0.2);
    color: #00ba7c;
    border-radius: 50%;
    margin-right: 10px;
    font-weight: bold;
}
.secondary-bullet {
    background-color: rgba(0, 186, 124, 0.1);
}
.comparison-item {
    display: flex;
    align-items: baseline;
    margin: 10px 0;
    padding: 6px 0;
}
.comparison-icon {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    min-width: 28px;
    height: 28px;
    background-color: rgba(0, 186, 124, 0.25);
    color: #00ba7c;
    border-radius: 50%;
    margin-right: 12px;
    font-weight: bold;
}
.comparison-text {
    flex: 1;
}
.learn-more-section {
    background-color: #1e2130;
    border-radius: 5px;
    padding: 20px;
}
.icon-wrapper {
    display: inline-flex;
    align-items: center;
    justify-content: center;
}
.colored-icon {
    display: inline-block;
    color: #00ba7c;
    font-size: 1.4em;
    margin-right: 10px;
}
.library-feature {
    display: flex;
    align-items: baseline;
    margin: 10px 0;
}
.feature-dot {
    min-width: 18px;
    height: 18px;
    background-color: rgba(0, 186, 124, 0.2);
    border-radius: 50%;
    margin-right: 10px;
    display: flex;
    align-items: center;
    justify-content: center;
}
.feature-text {
    flex: 1;
}
.sub-feature {
    display: flex;
    padding-left: 30px;
    margin: 8px 0;
    align-items: baseline;
}
.sub-feature-dot {
    min-width: 12px;
    height: 12px;
    background-color: rgba(0, 186, 124, 0.1);
    border-radius: 50%;
    margin-right: 10px;
}
.code-block {
    background-color: #0e1117;
    padding: 15px;
    border-radius: 5px;
    font-family: 'Courier New', monospace;
    margin: 15px 0;
    color: #e0e0e0;
    border-left: 3px solid #00ba7c;
}
.code-line {
    padding: 2px 0;
    display: block;
}
.code-import {
    color: #ff79c6;
}
.code-class {
    color: #8be9fd;
}
.code-function {
    color: #50fa7b;
}
.code-var {
    color: #f1fa8c;
}
.code-string {
    color: #f1fa8c;
}
.code-comment {
    color: #6272a4;
}
.link-top-a {
    color: rgb(72, 140, 255) !important;
    font-size: 18px;
}
.link-top {
    color: rgb(180, 220, 255) !important;
    font-size: 18px;
}
</style>
""", unsafe_allow_html=True)
# Header with logo and title
st.markdown("""
<div class="header-container">
    <div>
        <h1>tokeniser-py 🐣</h1>
        <a href="https://github.com/Tasmay-Tibrewal/tokeniser-py" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">Library GitHub (tokeniser-py)</span></a>
        <p class="link-top" style="display: inline;"> | </p>
        <a href="https://github.com/Tasmay-Tibrewal/tokeniser-py-lite" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">Library GitHub (tokeniser-py-lite)</span></a>
        <p class="link-top" style="display: inline;"> | </p>
        <a href="https://huggingface.co/datasets/Tasmay-Tib/Tokeniser" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">HF Dataset (unchunked)</span></a>
        <p class="link-top" style="display: inline;"> | </p>
        <a href="https://github.com/Tasmay-Tibrewal/Tokeniser" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">GitHub Dataset (chunked)</span></a>
        <p class="link-top" style="display: inline;"> | </p>
        <a href="https://github.com/Tasmay-Tibrewal/Tokeniser-imp" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">GitHub Imp Files</span></a>
        <p class="link-top" style="display: inline;"> | </p>
        <a href="https://pypi.org/project/tokeniser-py/" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">PyPI Package (Main Lib)</span></a>
        <p class="link-top" style="display: inline;"> | </p>
        <a href="https://pypi.org/project/tokeniser-py-lite/" class="link-top-a" style="display: inline;"><span style="background-color:rgba(100,146,154,0.17); padding:2px 4px; border-radius:3px;">PyPI Package (Lite Lib)</span></a>
        <p></p>
        <p style="font-size: 20px;"><strong>Learn about language model tokenization</strong></p>
        <p style="font-size: 17px; margin-bottom: 5px;">
            <span style="background-color:rgba(154, 187, 255,0.4); padding:2px 4px; border-radius:3px;">tokeniser-py's</span> custom tokenizer processes text as tokens: common sequences of characters found in a body of text. A language model learns the statistical relationships
            between these tokens and excels at producing the next token in a sequence. Use the tool below to see how a piece of text might be tokenized by a language model, and the total token count for that text.
        </p>
    </div>
</div>
""", unsafe_allow_html=True)
# Initialize tokenizer
def load_tokenizer(ln="1b", token_ordered=False):
    try:
        from tokeniser import Tokeniser
        # Pass parameters based on selection
        return Tokeniser(ln=ln, token_ordered=token_ordered)
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None
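# (Note: wrapping load_tokenizer in Streamlit's @st.cache_resource would avoid
# re-loading the vocab JSONs on every rerun; it is left off here because the
# Tokeniser's load cost and cache-safety haven't been verified.)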
# Information about tokenization
# st.markdown("""
# """)
# st.markdown("")
# st.markdown("")
st.markdown("###### Model")

# Create tabs for different models
model_version = st.radio(
    "",
    ["Default (1b model unordered)", "1b model ordered", "0.5b model unordered", "0.5b model ordered"],
    horizontal=True
)

# Map selected model version to parameters
if model_version == "Default (1b model unordered)":
    ln_param = "1b"
    ordered_param = False
elif model_version == "1b model ordered":
    ln_param = "1b"
    ordered_param = True
elif model_version == "0.5b model unordered":
    ln_param = "0.5b"
    ordered_param = False
else:
    ln_param = "0.5b"
    ordered_param = True

# Load tokenizer with selected parameters
tokenizer = load_tokenizer(ln=ln_param, token_ordered=ordered_param)
# Function to generate consistent pastel colors for tokens
def get_token_colors(tokens):
    # Use hash of token to get consistent colors
    colors = {}
    for token in set(tokens):
        # Generate a pastel color based on the hash of the token
        hash_val = hash(token) % 360
        colors[token] = f"hsl({hash_val}, 80%, 75%)"
    return colors
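# Note: Python's built-in hash() is salted per process, so token colours are
# consistent within one app run but can change across restarts. A sketch of a
# cross-run-stable alternative (hashlib is stdlib; not wired in here):
#
#     import hashlib
#     hue = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % 360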
# Function to display tokens with colors and hover effects
def display_colored_tokens(tokens, token_ids, token_colors):
    html = ""
    for token, token_id in zip(tokens, token_ids):
        # Handle special characters for display
        if token == '\n':
            display_token = '\\n'
        elif token == '\t':
            display_token = '\\t'
        else:
            # Escape HTML and make spaces visible inside the token chip
            display_token = token.replace("<", "&lt;").replace(">", "&gt;").replace(" ", "&nbsp;")
        html += f'<span class="token" style="background-color: {token_colors[token]};" data-id="{token_id}">{display_token}</span>'
    return html
# Function to display token IDs
def display_token_ids(token_ids):
    return f'<div class="monospace">{json.dumps(token_ids)}</div>'

# Initialize session state for text input if not exists
if 'text_input' not in st.session_state:
    st.session_state.text_input = "Hi I am Tasmay, I am a third year undergraduate at IIT Kharagpur and this is my tokeniser. Please enter your text in this box"
    st.session_state.text_ind = 0
st.markdown("###### Enter text to tokenize")

# Text input area
text_input = st.text_area(
    "",
    st.session_state.text_input,
    height=150,
    placeholder="Please enter the text to tokenise",
    # on_change=handle_text_change,
)

def clear_text():
    st.session_state.text_input = ""

def show_example():
    examples = [
        "Hi I am Tasmay, I am a third year undergraduate at IIT Kharagpur and this is my tokeniser. Please enter your text in this box",
        "Wop, wop, wop, wop, wop, I'ma do my stuff",
        "I got loyalty, got royalty inside my DNA",
        "Sit down, be humble",
        "We gon' be alright"
    ]
    st.session_state.text_ind = (st.session_state.text_ind + 1) % len(examples)
    st.session_state.text_input = examples[st.session_state.text_ind]
# Add CSS for fixed-width buttons that wrap to a new line
st.markdown("""
<style>
div[data-testid="stHorizontalBlock"] {
    flex-wrap: wrap;
    gap: 10px;
    margin-top: -15px;
    padding-top: 0px;
    margin-bottom: -15px;
}
div[data-testid="stHorizontalBlock"] > div {
    flex: 0 0 auto !important;
    width: auto !important;
    min-width: initial !important;
}
div[data-testid="stHorizontalBlock"] button {
    width: 80px; /* Fixed width for "Clear" button */
    margin-top: 0px;
}
div[data-testid="stHorizontalBlock"] div:nth-child(2) button {
    margin-top: 0px;
    width: 150px; /* Fixed width for "Show example" button */
}
</style>
""", unsafe_allow_html=True)
# Create a horizontal block for buttons
button_container = st.container()
with button_container:
    cols = st.columns([1, 1, 10])
    with cols[0]:
        st.button("Clear", on_click=clear_text)
    with cols[1]:
        st.button("Show example", on_click=show_example)

# Process the text for tokenization
# Defaults first, so the UI below still renders if the tokenizer failed to load
tokens = []
token_ids = []
num_tokens = 0
num_chars = 0
chars_per_token = 0
if tokenizer:
    try:
        tokens, count = tokenizer.tokenise(text_input)
        token_ids = tokenizer.token_ids(tokens)
        num_tokens = len(tokens)
        num_chars = len(text_input)
        chars_per_token = num_chars / num_tokens if num_tokens > 0 else 0
    except Exception as e:
        st.error(f"Error tokenizing text: {e}")
# Inject custom CSS
st.markdown(
    """
    <style>
    div[role="radiogroup"] > label {
        height: 40px !important;
        padding-left: 10px;
        display: flex;
        align-items: center;
    }
    div[role="radiogroup"] {
        margin-top: -30px;
        margin-bottom: 0px;
    }
    div[data-testid="stTextArea"] {
        margin-top: -30px;
    }
    </style>
    """,
    unsafe_allow_html=True
)
# st.markdown("###### View")
# Create view toggle
view_option = st.radio(
    "",
    ["Text", "Token IDs"],
    horizontal=True
)

# Get token colors if we have tokens
token_colors = get_token_colors(tokens) if tokens else {}

# Always display the token display, even if empty
if view_option == "Text":
    if tokens:
        st.markdown(f'<div class="token-display" style="margin-top: -25px;">{display_colored_tokens(tokens, token_ids, token_colors)}</div>', unsafe_allow_html=True)
    else:
        st.markdown('<div class="token-display" style="margin-top: -25px;">No tokens to display</div>', unsafe_allow_html=True)
else:
    if token_ids:
        st.markdown(f'<div class="token-display" style="margin-top: -25px;">{display_token_ids(token_ids)}</div>', unsafe_allow_html=True)
    else:
        st.markdown('<div class="token-display" style="margin-top: -25px;">No token IDs to display</div>', unsafe_allow_html=True)
# Always display the stats container, even if empty
st.markdown("""
<div class="stats-container" style="margin-top: -10px; margin-bottom: 10px;">
    <div class="stat-box">
        <div class="stat-label">Tokens</div>
        <div class="stat-value">{}</div>
    </div>
    <div class="stat-box">
        <div class="stat-label">Characters</div>
        <div class="stat-value">{}</div>
    </div>
    <div class="stat-box">
        <div class="stat-label">Chars per token</div>
        <div class="stat-value">{:.2f}</div>
    </div>
</div>
""".format(num_tokens, num_chars, chars_per_token),
unsafe_allow_html=True)
# Information box split into multiple markdown elements for better rendering
# st.markdown("<div class='info-box'>", unsafe_allow_html=True)

# Section 1: Tokenization Efficiency
st.markdown("---")
st.markdown("<h3 style='color:#00ba7c; margin-top:10px;'>Tokenization Efficiency</h3>", unsafe_allow_html=True)

# Quote block
st.markdown("""
<div style="border-left: 4px solid #00ba7c; padding-left: 15px; margin: 15px 0; color: #e0e0e0;">
    A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for
    common English text. This translates to roughly ¾ of a word (so 100 tokens ~= 75 words).
    <div style="font-style: italic; color: #aaa; margin-top: 5px;">– OpenAI</div>
</div>
""", unsafe_allow_html=True)
# Section 2: Our Analysis
st.markdown("<h3 style='color:#00ba7c; margin-top:20px;'>Our Analysis</h3>", unsafe_allow_html=True)
st.markdown("<p>We've conducted a thorough analysis of our tokeniser's token efficiency against other tokenizers:</p>", unsafe_allow_html=True)

# Analysis points with enhanced styling
st.markdown("""
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>The <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">GPT-2 tokenizer</span> averages approximately <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">3.9 characters per token</span></div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>English text corpora typically have average word lengths of <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">4.7 to 5.1 characters</span>; we observed <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">4.73-4.79 in our dataset</span></div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>Thus, on our dataset, traditional tokenizers yield roughly <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">⁴⁄₅ of a word</span> per token (100 tokens ≈ 80 words)</div>
</div>
""", unsafe_allow_html=True)
# Section 3: tokeniser-py Efficiency
st.markdown("<h3 style='color:#00ba7c; margin-top:20px;'><u>tokeniser-py</u> efficiency</h3>", unsafe_allow_html=True)
st.markdown("<p>Our tokenizer demonstrates different characteristics:</p>", unsafe_allow_html=True)

# Efficiency points with enhanced styling
st.markdown("""
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>Average token size of <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px;">~2.52 characters**</span> across all token types</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>For alphanumeric tokens only: <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">~3.97 characters per token</span></div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>This translates to approximately <span style="background-color:rgba(0,186,124,0.4); padding:2px 4px; border-radius:3px;">⁹⁄₁₀ of a word</span> (100 tokens ≈ 90 words)</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>Unlike other tokenizers, we treat spaces (' ') as separate tokens rather than concatenating them with other characters, which raises our total token count</div>
</div>
""", unsafe_allow_html=True)
# Section 4: Real-world Comparison with completely redesigned styling
st.markdown("""
<div style="background-color:#262b38; padding:20px; border-radius:5px; margin:25px 0;">
    <h3 style="color:#00ba7c; margin-top:0px; margin-bottom:15px; font-size:1.3em;">Real-world Comparison</h3>
    <p style="margin-bottom:15px;">We tested a 28-page blog post across different tokenizers:</p>
    <div class="comparison-item">
        <div class="comparison-icon">1</div>
        <div class="comparison-text">
            <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">GPT-4o/GPT-4:</span>
            <span style="font-size:1.1em; margin-left:8px;">~10.4k tokens</span>
        </div>
    </div>
    <div class="comparison-item">
        <div class="comparison-icon">2</div>
        <div class="comparison-text">
            <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">GPT-3:</span>
            <span style="font-size:1.1em; margin-left:8px;">~12.1k tokens</span>
        </div>
    </div>
    <div class="comparison-item">
        <div class="comparison-icon">3</div>
        <div class="comparison-text">
            <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">tokeniser-py:</span>
            <span style="font-size:1.1em; margin-left:8px;">~18.8k tokens</span>
            <span style="color:#aaa;">(including ~8.4k space tokens and ~2.6k other special-char based tokens)</span>
        </div>
    </div>
    <div class="comparison-item">
        <div class="comparison-icon">4</div>
        <div class="comparison-text">
            <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">tokeniser-py (alphanumeric only):</span>
            <span style="font-size:1.1em; margin-left:8px;">~7.8k tokens</span>
        </div>
    </div>
    <div class="comparison-item">
        <div class="comparison-icon">5</div>
        <div class="comparison-text">
            <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">GPT-4/GPT-4o (alphanumeric):</span>
            <span style="font-size:1.1em; margin-left:8px;">~8k tokens</span>
        </div>
    </div>
    <div class="comparison-item">
        <div class="comparison-icon">6</div>
        <div class="comparison-text">
            <span style="background-color:rgba(0,186,124,0.15); padding:2px 4px; border-radius:3px; font-weight:500;">Token corpus size:</span>
            <span style="font-size:1.1em; margin-left:8px;">131k (tokeniser-py) vs. 100k (GPT-4 multimodal)</span>
        </div>
    </div>
</div>
""", unsafe_allow_html=True)
# Note box with enhanced styling
st.markdown("""
<div style="background-color:rgba(255,204,0,0.1); border-left:3px solid rgba(255,204,0,0.7); padding:15px; margin:20px 0; border-radius:0 5px 5px 0;">
    <div style="font-size:18px; font-weight:bold; margin-bottom:12px; color:#ffcc00;">Note:</div>
    <p style="line-height:2.2;"><span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.2); color:#ffcc00;">•</span>
    <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">**2.52 characters</span> is the (adjusted-frequency)-weighted average token size, i.e. we weight each token's size by its true occurrences, obtained by adjusting its observed occurrences for its super-tokens' occurrences.<br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>A super-token of a token such as '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">e</span>' is any token that contains '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">e</span>' (like '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">ear</span>', '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">ears</span>', '<span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">years</span>', etc.). When weighting token lengths, smaller tokens get an unduly high weightage because their occurrences inside super-tokens are counted as well.
    To adjust for this, we hierarchically subtract a token's occurrences within its super-tokens to obtain its true frequency.</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>Un-adjusted frequency weighting gives an average size of <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">~2.2 characters</span> per token, and a raw (un-weighted) average gives <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">~4.6-4.7 chars</span> per token.</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>Our tokenization strategy separates non-underscore special characters from alphanumeric tokens.</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>We define alphanumeric tokens as any word that doesn't contain special characters (except underscores).</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>For OpenAI's tokens, we counted any token containing at least one alphanumeric character (excluding underscores) as an alphanumeric token.</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>This difference comes from the different special-character handling methodologies followed by the two tokenisers.</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>The tokeniser's better word-representation performance is not only due to technique differences but also because GPT-4 has fewer available tokens <span style="background-color:rgba(255,204,0,0.15); padding:2px 4px; border-radius:3px;">(100k vs our 131k)</span> and needs to reserve tokens for multimodal content, further reducing English-specific tokens.</span><br>
    <span class="bullet-point-icon" style="background-color:rgba(255,204,0,0.15); color:#ffcc00;">•</span>
    <span>Additionally, GPT-4's approach of combining special characters with alphanumerical content potentially reduces the availability of relevant alphanumerical tokens. Despite these constraints, GPT-4's tokeniser performs relatively well, though ours provides a valuable research preview of an alternative algorithm.</span></p>
</div>
""", unsafe_allow_html=True)
# Section 5: Design Philosophy with enhanced styling
st.markdown("<h3 style='color:#00ba7c; margin-top:20px;'>Design Philosophy</h3>", unsafe_allow_html=True)
st.markdown("<p>Our approach prioritizes semantic representation over token count minimization:</p>", unsafe_allow_html=True)

# Philosophy points with enhanced styling
st.markdown("""
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>We consciously separate special characters from alphanumeric tokens</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>This frees up more of the vocabulary for alphanumeric tokens</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>While this may increase the total token count, it improves semantic representation</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>Our design philosophy favors representation quality over token count minimization</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>For example, a space (' ') becomes a separate token in our system, rather than being concatenated onto neighbouring characters as in standard methods like OpenAI's</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>This approach yields better word representations despite potentially larger token counts</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>A combination-based tokenizer may reduce token count, but our focus on representation offers semantic advantages</div>
</div>
<div class="bullet-point">
    <div class="bullet-point-icon">•</div>
    <div>Combining special tokens with alphanumeric ones adds less semantic value than using pure alphanumeric tokens</div>
</div>
""", unsafe_allow_html=True)
# Footer link
st.markdown("""
<p style="margin-top:20px;">
    Need a programmatic interface for tokenizing text? Check out our
    <a href="https://pypi.org/project/tokeniser-py/">tokeniser-py</a> package for Python.
</p>
""", unsafe_allow_html=True)
# Footer with additional information
st.markdown("---")
st.markdown("""<h2 style='color:#00ba7c; margin-top:0px;'>About tokeniser-py</h2>
A high-performance, fully custom tokeniser built from scratch – no BPE, no existing NLP tokenisation scheme.
This tokeniser is based on a unique algorithm developed independently and trained on over 1 billion tokens
from the SlimPajama dataset (Val + Test), providing an efficient, interpretable, and extendable tokenisation pipeline.
<div class="library-feature">
    <div class="feature-dot">•</div>
    <div class="feature-text"><strong>Tokeniser built on a vocabulary of 131,072 tokens</strong></div>
</div>
<div class="library-feature">
    <div class="feature-dot">•</div>
    <div class="feature-text"><strong>Two versions of vocab:</strong> <code>0.5B</code> (Validation-only data) and <code>1B</code> (Validation + Test data)</div>
</div>
<div class="library-feature">
    <div class="feature-dot">•</div>
    <div class="feature-text"><strong>Token vocab built via a custom algorithm</strong> – no Byte Pair Encoding (BPE)</div>
</div>
<div class="library-feature">
    <div class="feature-dot">•</div>
    <div class="feature-text"><strong>Lightweight JSON format</strong> for token maps & token count maps</div>
</div>
<div class="library-feature">
    <div class="feature-dot">•</div>
    <div class="feature-text"><strong>Ready for integration</strong> into any LLM pre-tokenisation pipeline</div>
</div>

[GitHub Repository](https://github.com/Tasmay-Tibrewal/tokeniser-py) | [PyPI Package](https://pypi.org/project/tokeniser-py/)
""", unsafe_allow_html=True)
# Add explanation of the library in expandable section
with st.expander("Learn more about tokeniser-py"):
    st.markdown("""
### What This Library Offers
- Tokeniser built on a vocabulary of **131,072 tokens**
- Two versions of vocab:
    - `0.5B`: Validation-only data
    - `1B`: Validation + Test data
- Token vocab built via a **custom algorithm** – no Byte Pair Encoding (BPE)
- Tokenisation logic includes:
    - Token lookup from the pre-generated token map
    - Dynamic programming-based segmentation for out-of-vocab tokens (see the sketch following this section)
    - One-hot encoding (NumPy or PyTorch)
    - Visualisation utilities for tokens and token IDs
- Lightweight JSON format for token maps & token count maps
- Ready for integration into any LLM pre-tokenisation pipeline
    """)
# Add custom CSS
st.markdown("""
<style>
div.stCodeBlock {
    background-color: #1a1c24 !important;
    border-radius: 10px;
    padding-left: 25px;
    padding-top: 15px;
    padding-bottom: 15px;
}
pre.language-python {
    background-color: #1a1c24 !important;
    border-radius: 10px;
}
.code-header {
    font-size: 1.5em;
    font-weight: bold;
    margin-top: 0em;
    margin-bottom: 0.5em;
    display: flex;
    align-items: center;
}
.code-block {
    background-color: #1a1c24;
    border-radius: 5px;
    padding: 1em;
    margin-bottom: 1em;
    font-family: 'Courier New', monospace;
    white-space: pre;
    color: #d4d4d4;
    overflow-x: auto;
    line-height: 1.5;
}
.keyword { color: #c586c0; }
.string { color: #CE9178; }
.function { color: #4ec9b0; }
.parenthesis { color: #ffd700; }
.var { color: #8cdcfe; }
</style>
""", unsafe_allow_html=True)
# Code header and block with simpler HTML
st.markdown("""
<div class="code-header">🛠️ Usage</div>
<pre class="code-block"><span class="keyword">from</span> <span class="function">tokeniser</span> <span class="keyword">import</span> <span class="function">Tokeniser</span><br>
<span class="var">t</span> = <span class="function">Tokeniser</span><span class="parenthesis">()</span><br>
<span class="var">tokens</span>, <span class="var">count</span> = <span class="var">t</span>.<span class="function">tokenise</span><span class="parenthesis">(</span><span class="string">"Your input text here."</span><span class="parenthesis">)</span><br>
<span class="var">token_ids</span> = <span class="var">t</span>.<span class="function">token_ids</span><span class="parenthesis">(</span><span class="var">tokens</span><span class="parenthesis">)</span></pre>
""", unsafe_allow_html=True)
st.markdown("""
Use `t.one_hot_tokens(token_ids)` for NumPy-based one-hot encoding, or pass `op='torch'` for PyTorch.

### Vocab Files
- `ordered_tokenizer_1b_val_test_data.json` – Ordered tokens (1B data)
- `unordered_tokenizer_1b_val_test_data.json` – Unordered tokens (1B)
- `count_tokenizer_1b_val_test_data.json` – Token counts (1B)
- Similar structure for the 0.5B val-only version
""")