""" String processing pipeline functions for testing function analysis. """ import re from typing import List def normalize_whitespace(text): """Normalize whitespace by removing extra spaces and newlines.""" # Replace multiple whitespace with single space text = re.sub(r'\s+', ' ', text) # Strip leading and trailing whitespace return text.strip() def remove_special_characters(text, keep_chars=""): """Remove special characters, optionally keeping specified characters.""" # Keep alphanumeric, spaces, and specified characters pattern = fr"[^a-zA-Z0-9\s{re.escape(keep_chars)}]" return re.sub(pattern, '', text) def convert_to_lowercase(text): """Convert text to lowercase.""" return text.lower() def remove_stopwords(text, stopwords=None): """Remove common stopwords from text.""" if stopwords is None: stopwords = { 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must' } words = text.split() filtered_words = [word for word in words if word.lower() not in stopwords] return ' '.join(filtered_words) def extract_keywords(text, min_length=3): """Extract keywords (words longer than min_length).""" words = text.split() keywords = [word for word in words if len(word) >= min_length] return keywords def count_word_frequency(text): """Count frequency of each word in text.""" words = text.split() frequency = {} for word in words: frequency[word] = frequency.get(word, 0) + 1 return frequency def capitalize_words(text, exceptions=None): """Capitalize first letter of each word, with exceptions.""" if exceptions is None: exceptions = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'} words = text.split() capitalized = [] for i, word in enumerate(words): if i == 0 or word.lower() not in exceptions: capitalized.append(word.capitalize()) else: capitalized.append(word.lower()) return ' '.join(capitalized) def truncate_text(text, max_length=100, suffix="..."): """Truncate text to specified length with suffix.""" if len(text) <= max_length: return text truncated = text[:max_length - len(suffix)] # Try to break at last complete word last_space = truncated.rfind(' ') if last_space > max_length * 0.8: # If we can break at a word boundary truncated = truncated[:last_space] return truncated + suffix def text_processing_pipeline(text, operations=None): """Process text through a pipeline of operations.""" if operations is None: operations = [ 'normalize_whitespace', 'remove_special_characters', 'convert_to_lowercase', 'remove_stopwords' ] # Map operation names to functions operation_map = { 'normalize_whitespace': normalize_whitespace, 'remove_special_characters': remove_special_characters, 'convert_to_lowercase': convert_to_lowercase, 'remove_stopwords': remove_stopwords, 'capitalize_words': capitalize_words, 'truncate_text': truncate_text } result = text processing_steps = [] for operation in operations: if operation in operation_map: before = result result = operation_map[operation](result) processing_steps.append({ 'operation': operation, 'before': before[:50] + "..." if len(before) > 50 else before, 'after': result[:50] + "..." 


def analyze_text_statistics(text):
    """Analyze various statistics about the text."""
    words = text.split()
    stats = {
        'character_count': len(text),
        'word_count': len(words),
        'sentence_count': len(re.findall(r'[.!?]+', text)),
        'average_word_length': sum(len(word) for word in words) / len(words) if words else 0,
        'longest_word': max(words, key=len) if words else "",
        'shortest_word': min(words, key=len) if words else ""
    }
    return stats


if __name__ == "__main__":
    sample_text = """
    This is a SAMPLE text with various formatting issues!!!
    It has multiple spaces, special @#$% characters,
    and needs some serious cleaning & processing...
    """

    print("Original text:")
    print(repr(sample_text))

    processed_text, steps = text_processing_pipeline(sample_text)

    print("\nProcessing steps:")
    for step in steps:
        print(f"After {step['operation']}:")
        print(f"  {step['after']}")

    print(f"\nFinal result: {processed_text}")

    stats = analyze_text_statistics(processed_text)
    print(f"\nText statistics: {stats}")
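
    # Illustrative extra run (a sketch; it assumes only the operation names
    # mapped in text_processing_pipeline): title-case and truncate the
    # cleaned text instead of lowercasing it and removing stopwords.
    custom_text, _ = text_processing_pipeline(
        sample_text,
        operations=['normalize_whitespace', 'remove_special_characters',
                    'capitalize_words', 'truncate_text'],
    )
    print(f"\nCustom pipeline result: {custom_text}")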