""" Utility functions for the Inclusive World Curriculum Assistant """ import re from typing import List, Dict, Any from pathlib import Path import fitz from config import CURRICULUM_TOPICS def clean_text(text: str) -> str: """Clean and normalize text content""" # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Remove special characters that might interfere with processing text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text) return text.strip() def extract_curriculum_topics(text: str) -> List[str]: """Extract relevant curriculum topics from text""" found_topics = [] text_lower = text.lower() for topic in CURRICULUM_TOPICS: topic_lower = topic.lower() if any(word in text_lower for word in topic_lower.split()): found_topics.append(topic) return found_topics def create_curriculum_summary(docs: List[Dict[str, Any]]) -> Dict[str, Any]: """Create a summary of processed curriculum documents""" summary = { "total_documents": len(docs), "total_content_length": sum(len(doc.get('content', '')) for doc in docs), "topics_covered": [], "document_types": {} } # Analyze document types for doc in docs: filename = doc.get('filename', '') if 'week' in filename.lower(): week_num = re.search(r'week\s*(\d+)', filename.lower()) if week_num: summary["document_types"][f"Week {week_num.group(1)}"] = filename # Extract common topics all_content = ' '.join([doc.get('content', '') for doc in docs]) summary["topics_covered"] = extract_curriculum_topics(all_content) return summary def validate_pdf_file(file_path: str) -> bool: """Validate if a file is a readable PDF""" try: doc = fitz.open(file_path) if doc.page_count > 0: doc.close() return True doc.close() return False except Exception: return False def get_file_info(file_path: str) -> Dict[str, Any]: """Get information about a PDF file""" try: doc = fitz.open(file_path) info = { "filename": Path(file_path).name, "page_count": doc.page_count, "file_size": Path(file_path).stat().st_size, "is_valid": True } doc.close() return info except Exception as e: return { "filename": Path(file_path).name, "error": str(e), "is_valid": False }