# croakenizer / app.py
# getGO007's picture
# Update app.py
# 8fe0ddf verified
import html
import re

import streamlit as st
#############################################
# 1) DEFINE YOUR MINIMAL TOKENIZER CLASSES #
#############################################
class Token:
    """A single lexical token: its type, its text, and where it was found.

    Attributes:
        type: Name of the token category (the first constructor argument).
        value: The matched text.
        position: Offset of the token in the source text, or None when the
            caller did not supply one.
    """

    def __init__(self, token_type, value, position=None):
        self.type = token_type
        self.value = value
        self.position = position

    def __repr__(self):
        # Use !r rather than hand-written quotes so that values containing
        # quotes or backslashes (e.g. a "'" punctuation token) are rendered
        # unambiguously instead of producing output like value='''.
        return (
            f"Token(type={self.type!r}, value={self.value!r}, "
            f"position={self.position})"
        )
class Tokenizer:
    """Regex-based tokenizer that yields NUMBER, WORD and PUNCT tokens.

    Whitespace is matched (as SPACE) so that it separates tokens, but SPACE
    matches are discarded by ``tokenize``.

    The four patterns together cover every character, so no part of the
    input is silently skipped:

    * NUMBER - a run of decimal digits.
    * WORD   - a run of word characters that are neither digits nor ``_``
               (i.e. letters, including non-ASCII ones such as ``é``).
    * SPACE  - a run of whitespace.
    * PUNCT  - any single character that is not a word character or
               whitespace, or a single underscore.
    """

    token_specifications = [
        ('NUMBER', r'\d+'),
        # [^\W\d_] == "word character, minus digits, minus underscore".
        # The previous [A-Za-z]+ left non-ASCII letters unmatched, so they
        # were dropped from the output altogether.
        ('WORD', r'[^\W\d_]+'),
        ('SPACE', r'\s+'),
        # "_" is a word character, so [^\w\s] alone never matches it; list it
        # explicitly so underscores are reported instead of vanishing.
        ('PUNCT', r'[^\w\s]|_'),
    ]
    # One alternation of named groups; match.lastgroup then tells us which
    # specification produced a given match.
    combined_pattern = '|'.join(
        f'(?P<{name}>{pattern})' for (name, pattern) in token_specifications
    )

    def __init__(self, text):
        """Store *text* and compile the combined token pattern."""
        self.text = text
        self.regex = re.compile(self.combined_pattern)

    def tokenize(self):
        """Return the list of non-whitespace tokens found in ``self.text``.

        Each token carries the name of the matching specification, the
        matched text, and the start offset of the match in the text.
        """
        return [
            # Every alternative is wrapped entirely in its named group, so
            # match.group() is the same text as match.group(match.lastgroup).
            Token(match.lastgroup, match.group(), match.start())
            for match in self.regex.finditer(self.text)
            if match.lastgroup != 'SPACE'  # whitespace only separates tokens
        ]
#############################################
# 2) STREAMLIT APP LAYOUT & FUNCTIONALITY #
#############################################
# Stylesheet for the token boxes: a "blink" keyframe animation that fades the
# background between white and light green, and the .blink-box class that
# applies it. Text colour is not set here; it is passed per box via an inline
# style attribute.
_BLINK_BOX_CSS = """
<style>
/* Define a blinking animation */
@keyframes blink {
0% { background-color: white; }
50% { background-color: lightgreen; }
100% { background-color: white; }
}
.blink-box {
display: inline-block;
border: 1px solid #ccc;
padding: 5px 10px;
margin: 5px;
border-radius: 5px;
animation: blink 2s infinite;
/* We'll color the text in Python code itself or pass color via style */
}
</style>
"""

# unsafe_allow_html is required so the <style> element is emitted as HTML.
st.markdown(_BLINK_BOX_CSS, unsafe_allow_html=True)
st.title("My Tokenizer App")

# Let the user enter text
user_input = st.text_input("Enter your text:", "Hello world 123!")

# When the user clicks the button, we run the tokenizer
if st.button("Tokenize"):
    tokens = Tokenizer(user_input).tokenize()

    if not tokens:
        # Nothing to show (empty or whitespace-only input): say so instead of
        # rendering two bare separators.
        st.write("No tokens found.")
    else:
        # Colours are assigned round-robin, so they repeat after seven tokens.
        color_list = ["blue", "red", "orange", "purple", "green", "teal", "magenta"]

        # Build every box into ONE HTML string and emit it with a single
        # st.markdown call. Separate calls are rendered as separate stacked
        # elements, which defeats the inline-block layout of .blink-box.
        # Token text comes from user input and is inserted into raw HTML, so
        # it must be escaped ("<", ">", "&" and quotes are valid PUNCT tokens).
        boxes_html = "".join(
            f'<div class="blink-box" style="color:{color_list[i % len(color_list)]};">'
            f'<strong>{html.escape(tok.value)}</strong>'
            f'</div>'
            for i, tok in enumerate(tokens)
        )
        st.markdown(boxes_html, unsafe_allow_html=True)

        # Add a small separator
        st.write("---")

        # Now, for each token, display its details below the boxes.
        for number, tok in enumerate(tokens, start=1):
            st.subheader(f"Token {number}: {tok.value}")
            st.write(f"**Type:** {tok.type}")
            st.write(f"**Value:** {tok.value}")
            st.write(f"**Position:** {tok.position}")
            st.write("---")