Spaces:
Sleeping
Sleeping
| """ | |
| Utility functions for Smart Auto-Complete | |
| Provides common functionality for text processing, logging, and validation | |
| """ | |
import html
import logging
import re
import sys
import unicodedata
from typing import Any, Dict, List, Optional, Tuple
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure and return the application's console logger.

    The logger named "smart_autocomplete" is reset on every call: every
    handler currently attached to it is detached and a single stdout handler
    is installed, so calling this function repeatedly does not duplicate
    output lines.

    Args:
        level: Name of a logging level (DEBUG, INFO, WARNING, ERROR,
            CRITICAL). It is upper-cased and looked up as an attribute of
            the ``logging`` module.

    Returns:
        The configured "smart_autocomplete" logger.

    Raises:
        AttributeError: If ``level`` does not name an attribute of ``logging``.
    """
    app_logger = logging.getLogger("smart_autocomplete")

    # Resolve the level once; it is applied to both the logger and its handler.
    resolved_level = getattr(logging, level.upper())
    app_logger.setLevel(resolved_level)

    # Iterate over a snapshot because removeHandler mutates the handler list.
    for stale_handler in list(app_logger.handlers):
        app_logger.removeHandler(stale_handler)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(resolved_level)
    stdout_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
        )
    )

    app_logger.addHandler(stdout_handler)
    return app_logger
def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing.

    Processing order matters and is:
      1. Unicode NFKC normalization,
      2. removal of control characters (newline and tab are kept),
      3. HTML escaping,
      4. whitespace collapsing and trimming.

    Args:
        text: Raw input text. Falsy values (``None``, ``""``, ``0``) yield an
            empty string; other non-string values are converted with ``str``.

    Returns:
        The cleaned, HTML-escaped text.
    """
    if not text:
        return ""

    text = str(text)

    # Normalize BEFORE escaping: NFKC folds compatibility characters such as
    # fullwidth '<' (U+FF1C) into their ASCII form, so normalizing after
    # html.escape would re-introduce unescaped markup characters.
    text = unicodedata.normalize('NFKC', text)

    # Drop control characters (Unicode category Cc, which includes DEL and
    # the C1 range) except newline and tab. Done before whitespace collapsing
    # so that removing a control character cannot leave a double space behind.
    text = ''.join(
        char for char in text
        if char in '\n\t' or unicodedata.category(char) != 'Cc'
    )

    # HTML escape to prevent injection
    text = html.escape(text)

    # Remove excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)  # Max 2 consecutive newlines
    text = re.sub(r'[ \t]+', ' ', text)  # Runs of spaces/tabs -> single space

    return text.strip()
def _contains_phrase(text_lower: str, phrase: str) -> bool:
    """Return True if ``phrase`` occurs in ``text_lower`` as a whole word or phrase."""
    return re.search(r'\b' + re.escape(phrase) + r'\b', text_lower) is not None


def extract_context_hints(text: str) -> Dict[str, Any]:
    """
    Extract contextual hints from the input text to improve suggestions.

    Keyword lists made of natural-language words (greetings, signatures,
    question words, tone words, creative-writing words) are matched on word
    boundaries, so "hi" does not match inside "this" and "who" does not match
    inside "whole". Code markers are matched as plain substrings because most
    of them are punctuation or carry their own trailing space.

    Args:
        text: Input text to analyze.

    Returns:
        Dictionary with the keys:
            length (int): number of characters in ``text``
            word_count (int): number of whitespace-separated tokens
            has_greeting (bool): an email-style greeting was found
            has_signature (bool): an email-style sign-off was found
            has_code_markers (bool): a code marker substring was found
            has_questions (bool): a '?' or a question word was found
            tone (str): 'formal', 'casual' or 'neutral'
            language_style (str): 'technical', 'business', 'creative'
                or 'general'
    """
    email_greetings = ('dear', 'hello', 'hi', 'greetings', 'good morning', 'good afternoon')
    email_signatures = ('sincerely', 'best regards', 'thank you', 'yours truly', 'kind regards')
    code_markers = ('//', '/*', '*/', '#', 'def ', 'function', 'class ', 'import ', 'from ')
    question_words = ('what', 'how', 'why', 'when', 'where', 'who')
    formal_words = ('please', 'kindly', 'respectfully', 'sincerely', 'professional')
    casual_words = ('hey', 'yeah', 'cool', 'awesome', 'thanks')
    creative_words = ('once upon', 'story', 'character', 'plot')

    hints = {
        'length': len(text),
        'word_count': len(text.split()),
        'has_greeting': False,
        'has_signature': False,
        'has_code_markers': False,
        'has_questions': False,
        'tone': 'neutral',
        'language_style': 'general'
    }

    text_lower = text.lower()

    # Email patterns
    hints['has_greeting'] = any(_contains_phrase(text_lower, g) for g in email_greetings)
    hints['has_signature'] = any(_contains_phrase(text_lower, s) for s in email_signatures)

    # Code patterns (substring match; this is a coarse heuristic)
    hints['has_code_markers'] = any(marker in text_lower for marker in code_markers)

    # Questions
    hints['has_questions'] = '?' in text or any(
        _contains_phrase(text_lower, q) for q in question_words
    )

    # Tone: compare how many distinct formal vs. casual words appear
    formal_count = sum(1 for word in formal_words if _contains_phrase(text_lower, word))
    casual_count = sum(1 for word in casual_words if _contains_phrase(text_lower, word))
    if formal_count > casual_count:
        hints['tone'] = 'formal'
    elif casual_count > formal_count:
        hints['tone'] = 'casual'

    # Language style: code markers take precedence, then email, then creative
    if hints['has_code_markers']:
        hints['language_style'] = 'technical'
    elif hints['has_greeting'] or hints['has_signature']:
        hints['language_style'] = 'business'
    elif any(_contains_phrase(text_lower, c) for c in creative_words):
        hints['language_style'] = 'creative'

    return hints
def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate the API key format for a given provider.

    This is a format check only; it does not contact the provider.

    Args:
        api_key: The API key to validate. Surrounding whitespace is ignored.
        provider: The provider name ("openai" or "anthropic"), matched
            case-insensitively and ignoring surrounding whitespace.

    Returns:
        True if the key format is valid for the provider. False for a
        malformed key, an unknown provider, or a non-string argument.
    """
    if not api_key or not isinstance(api_key, str):
        return False
    # Guard the provider the same way as the key so None / non-strings
    # yield False instead of raising AttributeError on .lower().
    if not isinstance(provider, str):
        return False

    api_key = api_key.strip()
    provider_name = provider.strip().lower()
    long_enough = len(api_key) >= 40

    if provider_name == 'openai':
        # OpenAI keys start with 'sk-'. Anthropic keys share that prefix
        # ('sk-ant-'), so they are excluded explicitly.
        return (
            api_key.startswith('sk-')
            and not api_key.startswith('sk-ant-')
            and long_enough
        )
    if provider_name == 'anthropic':
        # Anthropic keys start with 'sk-ant-'
        return api_key.startswith('sk-ant-') and long_enough

    return False
def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length, optionally at a word boundary.

    When truncation happens, "..." is appended and counts toward
    ``max_length``, so the returned string is never longer than
    ``max_length`` characters.

    Args:
        text: Text to truncate.
        max_length: Maximum allowed length of the result, ellipsis included.
            Values of zero or below yield an empty string.
        preserve_words: If True, cut at the last space before the limit,
            provided that space lies in the final 20% of the available room.
            Otherwise (or if no such space exists) cut mid-word.

    Returns:
        ``text`` unchanged if it already fits, otherwise the truncated text.
    """
    suffix = "..."

    if len(text) <= max_length:
        return text

    # No room for any text plus the ellipsis: hard-cut without a suffix.
    # max() guards against negative limits, which would slice from the end.
    if max_length <= len(suffix):
        return text[:max(max_length, 0)]

    # Reserve space for the ellipsis so the result respects max_length
    budget = max_length - len(suffix)
    head = text[:budget]

    if preserve_words:
        last_space = head.rfind(' ')
        # Only use the word boundary if it's not too far back
        if last_space > budget * 0.8:
            head = head[:last_space]

    return head.rstrip() + suffix
def format_suggestions_for_display(suggestions: List[str], max_display_length: int = 100) -> List[Dict[str, Any]]:
    """
    Format suggestions for display in the UI.

    Each suggestion is cleaned with ``sanitize_input`` and a shortened
    display version is produced with ``truncate_text``.

    Args:
        suggestions: List of suggestion strings.
        max_display_length: Length limit passed to ``truncate_text`` for the
            display version.

    Returns:
        One dictionary per suggestion, in input order, with the keys:
            id (int): 1-based position of the suggestion
            text (str): the sanitized suggestion
            display_text (str): the sanitized suggestion after truncation
            length (int): number of characters in ``text``
            word_count (int): number of whitespace-separated tokens in ``text``
    """
    formatted = []

    for position, suggestion in enumerate(suggestions, 1):
        clean_suggestion = sanitize_input(suggestion)

        # NOTE(review): truncation runs on the already-sanitized text; if
        # sanitize_input emits HTML entities, a cut could land inside one.
        formatted.append({
            'id': position,
            'text': clean_suggestion,
            'display_text': truncate_text(clean_suggestion, max_display_length),
            'length': len(clean_suggestion),
            'word_count': len(clean_suggestion.split())
        })

    return formatted
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Score how similar two texts are by their shared vocabulary.

    Both texts are lower-cased and split on whitespace; the score is the
    Jaccard index of the two resulting word sets (shared words divided by
    all distinct words).

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1; 0.0 when either text is empty or
        when neither text contains any word.
    """
    if not (text1 and text2):
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    all_words = vocab_a | vocab_b
    if not all_words:
        # Both inputs were whitespace only
        return 0.0

    shared_words = vocab_a & vocab_b
    return len(shared_words) / len(all_words)
def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text.

    Args:
        text: Text to analyze.

    Returns:
        Dictionary with the keys:
            characters: number of non-whitespace characters
            words: number of whitespace-separated tokens
            sentences: number of runs of '.', '!' or '?' (a rough estimate),
                at least 1 for text that contains any word
            paragraphs: number of non-blank chunks separated by blank lines,
                at least 1 for text that contains any word
        All counts are 0 for empty or whitespace-only text.
    """
    # Whitespace-only text has no content, so report it like empty text
    # rather than claiming one sentence and one paragraph.
    if not text or not text.strip():
        return {'characters': 0, 'words': 0, 'sentences': 0, 'paragraphs': 0}

    words = text.split()

    # Summing token lengths excludes every kind of whitespace (including
    # '\r' and non-breaking spaces), not just space, newline and tab.
    char_count = sum(len(word) for word in words)

    # Count sentences (rough estimate): each run of terminators counts once
    sentence_count = len(re.findall(r'[.!?]+', text))

    # A blank line may contain spaces or '\r' (Windows line endings)
    paragraph_count = sum(1 for chunk in re.split(r'\n\s*\n', text) if chunk.strip())

    return {
        'characters': char_count,
        'words': len(words),
        'sentences': max(1, sentence_count),  # At least 1 sentence
        'paragraphs': max(1, paragraph_count)  # At least 1 paragraph
    }