# Scraped page header (not part of the program): Spaces: Sleeping | File size: 8,124 Bytes | revision aa4d076 | line-number gutter (1-202) omitted
# chunker_app.py
import streamlit as st
import math
# --- Page Configuration ---
# Issued before any other Streamlit command in this script.
st.set_page_config(
    page_title="Text Chunker Demo",
    # The icon had been mis-encoded into the Greek letter "π" (a 4-byte emoji
    # decoded with the wrong charset), which is not a valid emoji icon.
    # NOTE(review): the exact original emoji is not recoverable; confirm choice.
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
# --- Styling: optional CSS overrides for text areas, expanders and sidebar ---
_CUSTOM_CSS = """
<style>
/* Improve readability and spacing */
.stTextArea textarea {
font-family: monospace; /* Consistent font for text areas */
line-height: 1.5;
}
.stExpander {
border: 1px solid #e6eaf1; /* Subtle border for expanders */
border-radius: 0.5rem;
margin-bottom: 10px; /* Space between chunks */
background-color: #f8f9fa; /* Slightly different background for chunks */
}
.stExpander header {
font-weight: bold;
color: #262730; /* Darker header text */
}
.stExpander div[data-testid="stExpanderDetails"] {
padding: 15px; /* Padding inside the chunk view */
}
/* Sidebar styling */
[data-testid="stSidebar"] {
background-color: #f0f2f6; /* Light grey sidebar */
}
[data-testid="stSidebar"] h1 {
font-size: 1.5em; /* Smaller sidebar header */
color: #007bff; /* Blue sidebar title */
}
/* Button styling (if any buttons were used) */
/* .stButton>button { ... } */
</style>
"""
# The stylesheet is raw HTML, so HTML rendering has to be enabled explicitly.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# --- Core Chunking Function ---
def chunk_text(text, chunk_size, chunk_overlap):
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk begins with the last ``chunk_overlap`` characters of the
    chunk before it. Chunking stops as soon as a chunk reaches the end of the
    text, so no trailing chunk consisting only of text that the previous
    chunk already contains is produced.

    Invalid settings are reported with ``st.error`` and result in an empty
    list; nothing is raised.

    Args:
        text (str): The input text to chunk.
        chunk_size (int): Maximum size of each chunk in characters; must be
            positive.
        chunk_overlap (int): Number of characters shared between consecutive
            chunks; must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        list[str]: The chunks in order of appearance. Empty when ``text`` is
        empty or the settings are invalid.
    """
    if chunk_size <= 0:
        st.error("Chunk Size must be a positive integer.")
        return []
    if chunk_overlap < 0:
        st.error("Chunk Overlap cannot be negative.")
        return []
    if chunk_overlap >= chunk_size:
        st.error("Chunk Overlap must be smaller than Chunk Size to prevent infinite loops or empty chunks.")
        return []
    if not text:
        return []

    text_length = len(text)
    # The checks above guarantee step >= 1, so the walk always makes progress
    # and no separate infinite-loop guard is needed.
    step = chunk_size - chunk_overlap
    chunks = []
    for start_index in range(0, text_length, step):
        end_index = start_index + chunk_size
        chunks.append(text[start_index:end_index])
        if end_index >= text_length:
            # This chunk already reaches the end of the text. Any further
            # chunk would lie entirely inside it (pure overlap), so stop here.
            break
    return chunks
# --- Example Text ---
# Sample multi-paragraph text about Streamlit and text chunking; it is passed
# as the initial value of the input text area further down in this script.
EXAMPLE_TEXT = """Streamlit is an open-source Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science. In just a few minutes you can build and deploy powerful data apps.
Let's consider the process of chunking text. This is a common technique in Natural Language Processing (NLP), especially when dealing with large documents that need to be fed into models with fixed input size limits, like many transformer models (e.g., BERT, GPT).
Chunking involves breaking down a large piece of text into smaller, manageable segments or 'chunks'. The size of these chunks is a critical parameter. Another important parameter is the 'overlap'. Overlap means that consecutive chunks will share some amount of text. This is useful to ensure that semantic context is not lost at the boundaries of chunks. For example, if a sentence is split exactly between two chunks, having an overlap allows the model processing the chunks to see the full sentence eventually, potentially across two adjacent chunks.
Choosing the right chunk size and overlap depends heavily on the specific application, the model being used, and the nature of the text. Smaller chunks capture finer details but might lose broader context. Larger chunks retain more context but might exceed model limits or smooth over important local information. Overlap helps mitigate context loss at boundaries but increases the total number of chunks and computational overhead. Experimentation is often required to find the optimal settings.
"""
# --- Streamlit App Layout ---
# Sidebar for Settings: collects chunk_size and chunk_overlap from the user.
with st.sidebar:
    # The header emoji had been mis-encoded ("βοΈ"); restored to the gear emoji.
    st.markdown("<h1>⚙️ Chunking Settings</h1>", unsafe_allow_html=True)
    st.markdown("Configure how the text should be split.")

    chunk_size = st.number_input(
        "Chunk Size (characters)",
        min_value=1,
        value=250,
        step=50,
        help="Maximum number of characters per chunk.",
    )

    chunk_overlap = st.number_input(
        "Overlap (characters)",
        min_value=0,
        # Keep the initial overlap strictly below the current chunk size
        # (falls back to 0 when the chunk size is 1).
        value=min(50, chunk_size - 1 if chunk_size > 1 else 0),
        step=10,
        help="Number of characters shared between consecutive chunks. Must be less than Chunk Size.",
    )

    # The overlap input has no upper bound, so the user can still enter a
    # value that is not smaller than the chunk size; warn when that happens.
    if chunk_overlap >= chunk_size and chunk_size > 0:
        st.warning("Overlap should be smaller than Chunk Size.")

    st.markdown("---")
    st.markdown("Built with [Streamlit](https://streamlit.io)")
# Main content area: page title and a one-line usage hint.
# The title emoji had been mis-encoded into "π".
# NOTE(review): the exact original emoji is not recoverable; confirm choice.
st.title("📄 Text Chunking Demonstrator")
st.markdown(
    "Enter text and adjust the settings in the sidebar to see how it's divided into chunks."
)
st.divider()
# Input Text Area: pre-filled with EXAMPLE_TEXT; the result feeds the chunker.
# The subheader emoji had been mis-encoded into "π".
# NOTE(review): the exact original emoji is not recoverable; confirm choice.
st.subheader("📝 Input Text")
input_text = st.text_area(
    "Paste your text here or use the example:",
    value=EXAMPLE_TEXT,
    height=300,
    # The label is supplied but hidden; the subheader above acts as the title.
    label_visibility="collapsed",
)
st.divider()
# Display Chunks: run the chunker on the current input and render one
# expander per chunk. (The subheader emoji had been mis-encoded as "π§©".)
st.subheader(f"🧩 Generated Chunks (Size: {chunk_size}, Overlap: {chunk_overlap})")
if input_text:
    # Perform chunking (the function validates the settings and reports
    # problems through st.error, returning an empty list in that case).
    generated_chunks = chunk_text(input_text, chunk_size, chunk_overlap)
    if generated_chunks:
        st.info(f"Successfully generated **{len(generated_chunks)}** chunks.")
        # Display each chunk in an expander
        for number, chunk in enumerate(generated_chunks, start=1):
            expander_title = f"Chunk {number} (Length: {len(chunk)})"
            overlap_indicator = ""
            if chunk_overlap > 0 and number > 1:
                # A chunk cannot share more characters with its predecessor
                # than it contains, so cap the reported overlap at its length
                # (matters for a short final chunk).
                shared_chars = min(chunk_overlap, len(chunk))
                overlap_indicator = f" (Overlaps previous by {shared_chars} chars)"
            with st.expander(expander_title + overlap_indicator):
                # st.text renders plain text and preserves whitespace.
                st.text(chunk)
    elif chunk_size > 0 and chunk_overlap >= 0 and chunk_overlap < chunk_size:
        # Only show this if no error occurred and text was provided
        st.warning("No chunks were generated. The input text might be empty or shorter than the chunk size with zero overlap.")
else:
    st.info("Please enter some text in the input area above to generate chunks.")