KLypse / app /services /processing.py
DEVJHAWAR11
Deploy Klypse backend
54bef2f
# app/services/processing.py
import re
def clean_text(text: str) -> str:
"""
Clean transcript text by removing:
- Timestamp markers like {ts:123}
- Extra whitespace, line breaks
- Special characters and formatting artifacts
- Music/sound effect markers like [संगीत], [Music]
"""
if not text:
return ""
# Remove timestamp markers: {ts:123}, {ts:0}, etc.
text = re.sub(r'\{ts:\d+\}', '', text)
# Remove sound effect markers: [संगीत], [Music], [Applause], etc.
text = re.sub(r'\[.*?\]', '', text)
# Remove parentheses with metadata: (music), (laughing), etc.
text = re.sub(r'\(.*?\)', '', text)
# Remove URLs
text = re.sub(r'http[s]?://\S+', '', text)
# Replace multiple line breaks with space
text = text.replace('\n', ' ')
# Remove extra whitespace (multiple spaces to single space)
text = re.sub(r'\s+', ' ', text)
# Remove leading/trailing whitespace
text = text.strip()
return text
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
"""
Split text into chunks with overlap for better context preservation.
Args:
text: Cleaned text to chunk
chunk_size: Number of words per chunk (default: 500)
overlap: Number of overlapping words between chunks (default: 50)
Returns:
List of text chunks with overlap
"""
if not text:
return []
words = text.split()
# If text is smaller than chunk_size, return as single chunk
if len(words) <= chunk_size:
return [text]
chunks = []
start = 0
while start < len(words):
# Get chunk of words
end = start + chunk_size
chunk = " ".join(words[start:end])
chunks.append(chunk)
# Move start position with overlap
start = end - overlap
# Prevent infinite loop if we're at the end
if end >= len(words):
break
return chunks