"""Streamlit demo: a minimal regex tokenizer with a colorful token display.

NOTE(review): the original paste began with scraped page residue
("Spaces: / Sleeping") and trailing "| |" table artifacts on every line,
which made the file invalid Python; this header restores a clean module top.
"""
import re

import streamlit as st

#############################################
# 1) DEFINE YOUR MINIMAL TOKENIZER CLASSES #
#############################################
class Token:
    """A single lexical token: its type name, matched text, and start offset."""

    def __init__(self, token_type, value, position=None):
        # position is the 0-based character offset of the match start
        # (None when the caller does not track positions).
        self.type = token_type
        self.value = value
        self.position = position

    def __repr__(self):
        return f"Token(type='{self.type}', value='{self.value}', position={self.position})"
class Tokenizer:
    """Scans text into WORD, NUMBER, and PUNCT tokens; whitespace is dropped."""

    # Ordered (name, regex) pairs; earlier entries win when patterns overlap.
    token_specifications = [
        ('NUMBER', r'\d+'),
        ('WORD', r'[A-Za-z]+'),
        ('SPACE', r'\s+'),
        ('PUNCT', r'[^\w\s]'),  # any single char that is neither word nor space
    ]
    # One alternation with a named group per token type, so a match's
    # lastgroup tells us which spec fired.
    combined_pattern = '|'.join(
        f'(?P<{name}>{pattern})' for (name, pattern) in token_specifications
    )

    def __init__(self, text):
        self.text = text
        self.regex = re.compile(self.combined_pattern)

    def tokenize(self):
        """Return the list of Token objects found in self.text.

        SPACE matches are consumed but never emitted as tokens.
        """
        return [
            Token(m.lastgroup, m.group(m.lastgroup), m.start())
            for m in self.regex.finditer(self.text)
            if m.lastgroup != 'SPACE'
        ]
#############################################
# 2) STREAMLIT APP LAYOUT & FUNCTIONALITY #
#############################################

# Custom CSS: a blink animation plus the .blink-box class applied to each
# token container rendered later.
_BLINK_BOX_CSS = """
<style>
/* Define a blinking animation */
@keyframes blink {
0% { background-color: white; }
50% { background-color: lightgreen; }
100% { background-color: white; }
}
.blink-box {
display: inline-block;
border: 1px solid #ccc;
padding: 5px 10px;
margin: 5px;
border-radius: 5px;
animation: blink 2s infinite;
/* We'll color the text in Python code itself or pass color via style */
}
</style>
"""
st.markdown(_BLINK_BOX_CSS, unsafe_allow_html=True)

st.title("My Tokenizer App")

# Collect the text to tokenize from the user.
user_input = st.text_input("Enter your text:", "Hello world 123!")
# When the user clicks the button, run the tokenizer and render the results.
if st.button("Tokenize"):
    import html  # local stdlib import: escape token text before embedding in HTML

    tokenizer = Tokenizer(user_input)
    tokens = tokenizer.tokenize()

    # Round-robin palette: token i gets color_list[i % len(color_list)].
    color_list = ["blue", "red", "orange", "purple", "green", "teal", "magenta"]

    # Build ONE HTML string for all boxes. The original issued a separate
    # st.markdown call per token, but each call produces its own block-level
    # container, so the "inline" boxes actually stacked vertically. A single
    # markdown call keeps the inline-block divs flowing on one row as intended.
    # Token text is escaped because it is user input rendered with
    # unsafe_allow_html=True (e.g. a "<" PUNCT token would break the markup).
    boxes_html = "".join(
        f'<div class="blink-box" style="color:{color_list[i % len(color_list)]};">'
        f"<strong>{html.escape(tok.value)}</strong>"
        f"</div>"
        for i, tok in enumerate(tokens)
    )
    st.markdown(boxes_html, unsafe_allow_html=True)

    # Add a small separator
    st.write("---")

    # Detailed per-token breakdown below the boxes.
    for i, tok in enumerate(tokens):
        st.subheader(f"Token {i+1}: {tok.value}")
        st.write(f"**Type:** {tok.type}")
        st.write(f"**Value:** {tok.value}")
        st.write(f"**Position:** {tok.position}")
        st.write("---")