JatinAutonomousLabs's picture
Upload 4 files
9d08bab verified
raw
history blame contribute delete
879 Bytes
#!/usr/bin/env python3
"""Text Processing Plugin"""
import re
from typing import List
class TextProcessor:
    """Clean text and pull simple entities (emails, URLs, tokens) out of it.

    All methods are stateless: they read nothing from the instance except
    the precompiled patterns below, and never mutate their input.
    """

    # Compiled once at class creation so repeated calls skip the pattern lookup.
    _WHITESPACE_RE = re.compile(r'\s+')
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    _URL_RE = re.compile(r'https?://[^\s]+')

    # Characters that commonly follow a URL in running prose but are almost
    # never meant to be its last character.
    _URL_TRAILING_PUNCT = '.,;:!?\'"'

    def clean_text(self, text: str) -> str:
        """Collapse whitespace runs to a single space and trim both ends.

        Only whitespace is normalized; punctuation and other non-whitespace
        characters are left untouched.

        Args:
            text: The raw input string.

        Returns:
            ``text`` with every run of whitespace replaced by one space and
            with leading/trailing whitespace removed.
        """
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def extract_emails(self, text: str) -> List[str]:
        """Return every email-like substring of ``text`` in order of appearance.

        The pattern is a pragmatic approximation (local part, ``@``, dotted
        domain ending in at least two letters), not a full RFC 5322 parser.

        Args:
            text: The string to scan.

        Returns:
            A list of matched addresses; empty when none are found.
        """
        return self._EMAIL_RE.findall(text)

    def extract_urls(self, text: str) -> List[str]:
        """Return every ``http://`` / ``https://`` URL found in ``text``.

        A URL runs up to the next whitespace character. Trailing sentence
        punctuation and unbalanced closing parentheses are trimmed, so
        ``"(see https://example.com)."`` yields ``"https://example.com"``
        rather than ``"https://example.com)."``.

        Args:
            text: The string to scan.

        Returns:
            A list of URLs in order of appearance; empty when none are found.
        """
        return [self._trim_url(raw) for raw in self._URL_RE.findall(text)]

    def tokenize(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it on whitespace.

        Punctuation stays attached to the neighbouring word.

        Args:
            text: The string to tokenize.

        Returns:
            A list of lower-cased tokens; empty for empty or
            whitespace-only input.
        """
        return text.lower().split()

    @classmethod
    def _trim_url(cls, url: str) -> str:
        """Strip prose punctuation that was captured at the end of ``url``."""
        while url:
            last = url[-1]
            if last in cls._URL_TRAILING_PUNCT:
                url = url[:-1]
            elif last == ')' and url.count(')') > url.count('('):
                # A ')' with no matching '(' inside the URL belongs to the
                # surrounding sentence; balanced ones are kept.
                url = url[:-1]
            else:
                break
        return url