Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import re | |
| ############################################# | |
| # 1) DEFINE YOUR MINIMAL TOKENIZER CLASSES # | |
| ############################################# | |
class Token:
    """A single lexical token: its type name, matched text, and start offset."""

    def __init__(self, token_type, value, position=None):
        # Store the classification, the raw matched substring, and where
        # in the input the match began (None if the caller doesn't track it).
        self.type = token_type
        self.value = value
        self.position = position

    def __repr__(self):
        parts = (
            f"type='{self.type}'",
            f"value='{self.value}'",
            f"position={self.position}",
        )
        return "Token(" + ", ".join(parts) + ")"
| class Tokenizer: | |
| """A simple tokenizer for WORD, NUMBER, and SPACE.""" | |
| token_specifications = [ | |
| ('NUMBER', r'\d+'), | |
| ('WORD', r'[A-Za-z]+'), | |
| ('SPACE', r'\s+'), | |
| ('PUNCT', r'[^\w\s]'), # <--- Added punctuation pattern | |
| ] | |
| combined_pattern = '|'.join( | |
| f'(?P<{name}>{pattern})' for (name, pattern) in token_specifications | |
| ) | |
| def __init__(self, text): | |
| self.text = text | |
| self.regex = re.compile(self.combined_pattern) | |
| def tokenize(self): | |
| tokens = [] | |
| for match in self.regex.finditer(self.text): | |
| token_type = match.lastgroup | |
| token_value = match.group(token_type) | |
| position = match.start() | |
| # Ignore spaces | |
| if token_type == 'SPACE': | |
| continue | |
| tokens.append(Token(token_type, token_value, position)) | |
| return tokens | |
#############################################
# 2) STREAMLIT APP LAYOUT & FUNCTIONALITY  #
#############################################

# Inject custom CSS for blinking boxes, unique coloring, etc.
st.markdown(
    """
    <style>
    /* Define a blinking animation */
    @keyframes blink {
        0% { background-color: white; }
        50% { background-color: lightgreen; }
        100% { background-color: white; }
    }
    .blink-box {
        display: inline-block;
        border: 1px solid #ccc;
        padding: 5px 10px;
        margin: 5px;
        border-radius: 5px;
        animation: blink 2s infinite;
        /* We'll color the text in Python code itself or pass color via style */
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("My Tokenizer App")

# Text field for the string to be tokenized (with a sample default).
text_to_scan = st.text_input("Enter your text:", "Hello world 123!")

# Run the tokenizer only when the button is pressed.
if st.button("Tokenize"):
    produced_tokens = Tokenizer(text_to_scan).tokenize()

    # Round-robin palette so adjacent boxes get distinct colors.
    palette = ["blue", "red", "orange", "purple", "green", "teal", "magenta"]
    palette_size = len(palette)

    # First pass: render each token as an inline blinking box.
    for idx, token in enumerate(produced_tokens):
        box_color = palette[idx % palette_size]
        box_html = f"""
            <div class="blink-box" style="color:{box_color};">
                <strong>{token.value}</strong>
            </div>
            """
        st.markdown(box_html, unsafe_allow_html=True)

    # Visual separator between the boxes and the detail listing.
    st.write("---")

    # Second pass: one detail section per token.
    for idx, token in enumerate(produced_tokens):
        st.subheader(f"Token {idx+1}: {token.value}")
        st.write(f"**Type:** {token.type}")
        st.write(f"**Value:** {token.value}")
        st.write(f"**Position:** {token.position}")
        st.write("---")