"""Streamlit demo: a minimal regex tokenizer with a colorful token display.

NOTE(review): the original paste began with scraped page residue
("Spaces: / Sleeping") and trailing "| |" table artifacts on every line,
which made the file invalid Python; this header restores a clean module top.
"""
import re

import streamlit as st

#############################################
# 1) DEFINE YOUR MINIMAL TOKENIZER CLASSES #
#############################################
class Token:
    """A single lexical token: its type name, matched text, and start offset."""

    def __init__(self, token_type, value, position=None):
        # position is the 0-based character offset of the match start
        # (None when the caller does not track positions).
        self.type = token_type
        self.value = value
        self.position = position

    def __repr__(self):
        return f"Token(type='{self.type}', value='{self.value}', position={self.position})"
class Tokenizer:
    """Scans text into WORD, NUMBER, and PUNCT tokens; whitespace is dropped."""

    # Ordered (name, regex) pairs; earlier entries win when patterns overlap.
    token_specifications = [
        ('NUMBER', r'\d+'),
        ('WORD', r'[A-Za-z]+'),
        ('SPACE', r'\s+'),
        ('PUNCT', r'[^\w\s]'),  # any single char that is neither word nor space
    ]
    # One alternation with a named group per token type, so a match's
    # lastgroup tells us which spec fired.
    combined_pattern = '|'.join(
        f'(?P<{name}>{pattern})' for (name, pattern) in token_specifications
    )

    def __init__(self, text):
        self.text = text
        self.regex = re.compile(self.combined_pattern)

    def tokenize(self):
        """Return the list of Token objects found in self.text.

        SPACE matches are consumed but never emitted as tokens.
        """
        return [
            Token(m.lastgroup, m.group(m.lastgroup), m.start())
            for m in self.regex.finditer(self.text)
            if m.lastgroup != 'SPACE'
        ]
#############################################
# 2) STREAMLIT APP LAYOUT & FUNCTIONALITY #
#############################################

# Custom CSS: a blink animation plus the .blink-box class applied to each
# token container rendered later.
_BLINK_BOX_CSS = """
<style>
/* Define a blinking animation */
@keyframes blink {
0% { background-color: white; }
50% { background-color: lightgreen; }
100% { background-color: white; }
}
.blink-box {
display: inline-block;
border: 1px solid #ccc;
padding: 5px 10px;
margin: 5px;
border-radius: 5px;
animation: blink 2s infinite;
/* We'll color the text in Python code itself or pass color via style */
}
</style>
"""
st.markdown(_BLINK_BOX_CSS, unsafe_allow_html=True)

st.title("My Tokenizer App")

# Collect the text to tokenize from the user.
user_input = st.text_input("Enter your text:", "Hello world 123!")
# When the user clicks the button, run the tokenizer and render the results.
if st.button("Tokenize"):
    import html  # local stdlib import: escape token text before embedding in HTML

    tokenizer = Tokenizer(user_input)
    tokens = tokenizer.tokenize()

    # Round-robin palette: token i gets color_list[i % len(color_list)].
    color_list = ["blue", "red", "orange", "purple", "green", "teal", "magenta"]

    # Build ONE HTML string for all boxes. The original issued a separate
    # st.markdown call per token, but each call produces its own block-level
    # container, so the "inline" boxes actually stacked vertically. A single
    # markdown call keeps the inline-block divs flowing on one row as intended.
    # Token text is escaped because it is user input rendered with
    # unsafe_allow_html=True (e.g. a "<" PUNCT token would break the markup).
    boxes_html = "".join(
        f'<div class="blink-box" style="color:{color_list[i % len(color_list)]};">'
        f"<strong>{html.escape(tok.value)}</strong>"
        f"</div>"
        for i, tok in enumerate(tokens)
    )
    st.markdown(boxes_html, unsafe_allow_html=True)

    # Add a small separator
    st.write("---")

    # Detailed per-token breakdown below the boxes.
    for i, tok in enumerate(tokens):
        st.subheader(f"Token {i+1}: {tok.value}")
        st.write(f"**Type:** {tok.type}")
        st.write(f"**Value:** {tok.value}")
        st.write(f"**Position:** {tok.position}")
        st.write("---")