Spaces:

Iagoaladin
/

Smart-Auto-Complete

Sleeping

Sandipan Haldar

smart autocompletions app

1492cf4 7 months ago

8.52 kB

	"""
	Utility functions for Smart Auto-Complete
	Provides common functionality for text processing, logging, and validation
	"""

	import logging
	import re
	import sys
	from typing import Dict, List, Optional, Tuple
	import html
	import unicodedata


	def setup_logging(level: str = "INFO") -> logging.Logger:
	    """
	    Set up logging configuration for the application

	    Configures the "smart_autocomplete" logger with exactly one stdout
	    handler. Calling this function repeatedly is safe: handlers attached by
	    an earlier call are removed before the new one is added.

	    Args:
	        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
	            case-insensitive. An unrecognised value falls back to INFO and
	            a warning is logged instead of raising.

	    Returns:
	        Configured logger instance
	    """
	    # Resolve the level name once. getattr() on the logging module can also
	    # return non-level attributes (e.g. the BASIC_FORMAT string), so only
	    # accept a genuine integer level.
	    requested = str(level).strip().upper()
	    numeric_level = getattr(logging, requested, None)
	    level_is_valid = isinstance(numeric_level, int) and not isinstance(numeric_level, bool)
	    if not level_is_valid:
	        numeric_level = logging.INFO

	    logger = logging.getLogger("smart_autocomplete")
	    logger.setLevel(numeric_level)

	    # Remove existing handlers to avoid duplicate output on repeated calls
	    for handler in logger.handlers[:]:
	        logger.removeHandler(handler)

	    # Console handler writing formatted records to stdout
	    console_handler = logging.StreamHandler(sys.stdout)
	    console_handler.setLevel(numeric_level)
	    console_handler.setFormatter(
	        logging.Formatter(
	            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	            datefmt='%Y-%m-%d %H:%M:%S',
	        )
	    )
	    logger.addHandler(console_handler)

	    if not level_is_valid:
	        # Reported only after the handler exists so the message is visible.
	        logger.warning("Unknown log level %r; falling back to INFO", level)

	    return logger


	def sanitize_input(text: str) -> str:
	    """
	    Sanitize and clean input text for processing

	    Steps, in order: Unicode NFKC normalization, removal of control
	    characters, whitespace collapsing, trimming, and finally HTML escaping.

	    Args:
	        text: Raw input text. Falsy values (None, "", 0) yield "". Other
	            non-string values are converted with str().

	    Returns:
	        Cleaned and sanitized text with HTML special characters escaped
	    """
	    if not text:
	        return ""

	    # Convert to string if not already
	    text = str(text)

	    # Normalize unicode characters BEFORE escaping: NFKC folds compatibility
	    # characters (e.g. U+FF1C FULLWIDTH LESS-THAN SIGN) into ASCII '<' / '>',
	    # which would bypass an escape applied earlier.
	    text = unicodedata.normalize('NFKC', text)

	    # Remove control characters (Unicode category Cc: C0 range, DEL and the
	    # C1 range) except newlines and tabs. This runs before the whitespace
	    # collapsing so that the '\r' of CRLF line endings cannot hide runs of
	    # blank lines from the newline pattern below.
	    text = ''.join(
	        char for char in text
	        if char in '\n\t' or unicodedata.category(char) != 'Cc'
	    )

	    # Remove excessive whitespace but preserve structure
	    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)  # Max 2 consecutive newlines
	    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs to single space

	    # Trim leading/trailing whitespace
	    text = text.strip()

	    # HTML escape last, so nothing above can reintroduce markup characters
	    return html.escape(text)


	def _contains_phrase(text_lower: str, phrase: str) -> bool:
	    """Return True if `phrase` occurs in `text_lower` as whole words, not inside a longer word."""
	    pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
	    return re.search(pattern, text_lower) is not None


	def _looks_like_code(text_lower: str) -> bool:
	    """Return True if `text_lower` contains comment markers or code-style keyword usage."""
	    # Comment markers are matched as plain substrings.
	    symbol_markers = ('//', '/*', '*/', '#')
	    if any(marker in text_lower for marker in symbol_markers):
	        return True

	    # Keywords only count when they open a line (or, for "function", are
	    # followed by a parameter list); "from " or "import " in the middle of
	    # an ordinary sentence is not a sign of code.
	    keyword_pattern = (
	        r'^[ \t]*(?:def|class|import|from)[ \t]'
	        r'|^[ \t]*function\b'
	        r'|\bfunction[ \t]*\w*[ \t]*\('
	    )
	    return re.search(keyword_pattern, text_lower, re.MULTILINE) is not None


	def extract_context_hints(text: str) -> Dict[str, object]:
	    """
	    Extract contextual hints from the input text to improve suggestions

	    All keyword checks are case-insensitive and match whole words only, so
	    "this" does not count as the greeting "hi".

	    Args:
	        text: Input text to analyze. None is treated as an empty string.

	    Returns:
	        Dictionary containing context hints:
	            length (int): number of characters
	            word_count (int): number of whitespace-separated words
	            has_greeting (bool): an email-style greeting is present
	            has_signature (bool): an email-style sign-off is present
	            has_code_markers (bool): comment markers or code keywords found
	            has_questions (bool): a '?' or a question word is present
	            tone (str): 'formal', 'casual' or 'neutral'
	            language_style (str): 'technical', 'business', 'creative'
	                or 'general'
	    """
	    text = text or ""
	    text_lower = text.lower()

	    email_greetings = ['dear', 'hello', 'hi', 'greetings', 'good morning', 'good afternoon']
	    email_signatures = ['sincerely', 'best regards', 'thank you', 'yours truly', 'kind regards']
	    question_words = ['what', 'how', 'why', 'when', 'where', 'who']
	    formal_words = ['please', 'kindly', 'respectfully', 'sincerely', 'professional']
	    casual_words = ['hey', 'yeah', 'cool', 'awesome', 'thanks']
	    creative_markers = ['once upon', 'story', 'character', 'plot']

	    # Check for email patterns
	    has_greeting = any(_contains_phrase(text_lower, g) for g in email_greetings)
	    has_signature = any(_contains_phrase(text_lower, s) for s in email_signatures)

	    # Check for code patterns
	    has_code_markers = _looks_like_code(text_lower)

	    # Check for questions
	    has_questions = '?' in text or any(
	        _contains_phrase(text_lower, q) for q in question_words
	    )

	    # Determine tone: whichever word family has more distinct hits wins;
	    # a tie (including no hits at all) stays neutral.
	    formal_count = sum(1 for word in formal_words if _contains_phrase(text_lower, word))
	    casual_count = sum(1 for word in casual_words if _contains_phrase(text_lower, word))
	    if formal_count > casual_count:
	        tone = 'formal'
	    elif casual_count > formal_count:
	        tone = 'casual'
	    else:
	        tone = 'neutral'

	    # Determine language style; code markers take precedence over email
	    # patterns, which take precedence over creative-writing markers.
	    if has_code_markers:
	        language_style = 'technical'
	    elif has_greeting or has_signature:
	        language_style = 'business'
	    elif any(_contains_phrase(text_lower, c) for c in creative_markers):
	        language_style = 'creative'
	    else:
	        language_style = 'general'

	    return {
	        'length': len(text),
	        'word_count': len(text.split()),
	        'has_greeting': has_greeting,
	        'has_signature': has_signature,
	        'has_code_markers': has_code_markers,
	        'has_questions': has_questions,
	        'tone': tone,
	        'language_style': language_style,
	    }


	def validate_api_key(api_key: str, provider: str) -> bool:
	    """
	    Validate API key format for different providers

	    This is a format check only; it does not contact the provider.

	    Args:
	        api_key: The API key to validate. Surrounding whitespace is ignored.
	        provider: The provider name (openai, anthropic), case-insensitive

	    Returns:
	        True if the key format is valid, False otherwise. Non-string
	        arguments and unknown providers yield False rather than raising.
	    """
	    if not isinstance(api_key, str) or not isinstance(provider, str):
	        return False

	    key = api_key.strip()
	    min_length = 40

	    # A key is a single token: reject empty, too-short, or keys with
	    # embedded whitespace (typically a copy/paste accident).
	    if len(key) < min_length or any(char.isspace() for char in key):
	        return False

	    provider_name = provider.strip().lower()

	    if provider_name == 'anthropic':
	        # Anthropic keys start with 'sk-ant-'
	        return key.startswith('sk-ant-')

	    if provider_name == 'openai':
	        # OpenAI keys start with 'sk-' and are typically 51 characters.
	        # 'sk-ant-' is the Anthropic prefix, so such a key is not accepted
	        # as an OpenAI key even though it also starts with 'sk-'.
	        return key.startswith('sk-') and not key.startswith('sk-ant-')

	    return False


	def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
	    """
	    Truncate text to a maximum length while optionally preserving word boundaries

	    When truncation happens, the result ends with "..." and the ellipsis is
	    counted against max_length, so the returned string is never longer than
	    max_length.

	    Args:
	        text: Text to truncate
	        max_length: Maximum allowed length of the result. Values <= 0
	            yield an empty string.
	        preserve_words: Whether to cut at the last space instead of in the
	            middle of a word (only if that space is within the last 20%
	            of the available room)

	    Returns:
	        Truncated text
	    """
	    ellipsis = "..."

	    if max_length <= 0:
	        return ""

	    if len(text) <= max_length:
	        return text

	    # Not enough room for any text plus the ellipsis: hard cut only
	    if max_length <= len(ellipsis):
	        return text[:max_length]

	    # Room left for actual text once the ellipsis is accounted for
	    budget = max_length - len(ellipsis)
	    head = text[:budget]

	    # If the cut already falls between words there is nothing to preserve;
	    # otherwise back up to the last space, unless that is too far back.
	    if preserve_words and not text[budget].isspace():
	        last_space = head.rfind(' ')
	        if last_space > budget * 0.8:
	            head = head[:last_space]

	    return head.rstrip() + ellipsis


	def format_suggestions_for_display(suggestions: List[str], max_display_length: int = 100) -> List[Dict[str, object]]:
	    """
	    Format suggestions for display in the UI

	    Args:
	        suggestions: List of suggestion strings. None is treated as an
	            empty list.
	        max_display_length: Maximum length passed to truncate_text() for
	            the display version

	    Returns:
	        List of formatted suggestion dictionaries, one per suggestion and
	        in input order, each with the keys:
	            id (int): 1-based position of the suggestion
	            text (str): suggestion after sanitize_input()
	            display_text (str): `text` after truncate_text()
	            length (int): number of characters in `text`
	            word_count (int): number of whitespace-separated words in `text`
	    """
	    formatted: List[Dict[str, object]] = []

	    for position, suggestion in enumerate(suggestions or [], 1):
	        # Clean the suggestion
	        clean_suggestion = sanitize_input(suggestion)

	        formatted.append({
	            'id': position,
	            'text': clean_suggestion,
	            # Display version (truncated if needed)
	            'display_text': truncate_text(clean_suggestion, max_display_length),
	            'length': len(clean_suggestion),
	            'word_count': len(clean_suggestion.split()),
	        })

	    return formatted


	def calculate_text_similarity(text1: str, text2: str) -> float:
	    """
	    Calculate similarity between two texts using simple word overlap

	    Words are compared case-insensitively and without surrounding
	    punctuation, so "Hello, world!" and "hello world" are identical.

	    Args:
	        text1: First text
	        text2: Second text

	    Returns:
	        Jaccard similarity of the two word sets, between 0 and 1. Returns
	        0.0 if either text is empty or contains no words.
	    """
	    if not text1 or not text2:
	        return 0.0

	    # A word is a run of word characters, optionally with inner apostrophes
	    # ("don't"); punctuation attached to a word is not part of it.
	    word_pattern = r"\w+(?:'\w+)*"
	    words1 = set(re.findall(word_pattern, text1.lower()))
	    words2 = set(re.findall(word_pattern, text2.lower()))

	    if not words1 or not words2:
	        return 0.0

	    # Jaccard similarity: shared words / all distinct words
	    return len(words1 & words2) / len(words1 | words2)


	def get_text_stats(text: str) -> Dict[str, int]:
	    """
	    Get basic statistics about the text

	    Args:
	        text: Text to analyze

	    Returns:
	        Dictionary with text statistics:
	            characters: number of non-whitespace characters
	            words: number of whitespace-separated words
	            sentences: rough estimate based on ., ! and ? terminators
	            paragraphs: number of non-empty blocks separated by blank lines
	        All values are 0 for empty or whitespace-only text; otherwise
	        sentences and paragraphs are at least 1.
	    """
	    if not text or not text.strip():
	        return {'characters': 0, 'words': 0, 'sentences': 0, 'paragraphs': 0}

	    # Count characters (excluding all whitespace, including '\r')
	    char_count = sum(1 for char in text if not char.isspace())

	    # Count words
	    word_count = len(text.split())

	    # Count sentences (rough estimate). A terminator only counts when it is
	    # followed by whitespace or the end of the text (optionally after a
	    # closing quote/bracket), so "3.14" is not treated as a sentence break.
	    terminator = r'[.!?]+["\')\]]*'
	    sentence_count = len(re.findall(terminator + r'(?=\s|$)', text))
	    if re.search(terminator + r'\s*$', text) is None:
	        # Trailing text without a terminator is still a sentence
	        sentence_count += 1

	    # Count paragraphs: blocks separated by a blank line, where the blank
	    # line may itself contain spaces or a '\r'
	    paragraph_count = sum(1 for block in re.split(r'\n\s*\n', text) if block.strip())

	    return {
	        'characters': char_count,
	        'words': word_count,
	        'sentences': max(1, sentence_count),  # At least 1 sentence
	        'paragraphs': max(1, paragraph_count),  # At least 1 paragraph
	    }