""" Advanced Academic Text Humanizer with State-of-the-Art ML Models This module provides cutting-edge text transformation capabilities using the latest ML models for superior AI text humanization, including T5 paraphrasing, advanced sentence transformers, and AI detection avoidance techniques. """ import ssl import random import warnings import re import logging import math from typing import List, Dict, Tuple, Optional, Union from dataclasses import dataclass from functools import lru_cache import nltk import spacy import torch import numpy as np from nltk.tokenize import word_tokenize, sent_tokenize from nltk.corpus import wordnet, stopwords from sentence_transformers import SentenceTransformer, util from transformers import ( T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer, pipeline, AutoTokenizer, AutoModelForCausalLM ) # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Suppress warnings warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) # Global models NLP_GLOBAL = None DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu") # Latest state-of-the-art models configuration LATEST_MODELS = { 'sentence_transformers': { 'premium': 'sentence-transformers/all-MiniLM-L12-v2', # Lighter premium option 'balanced': 'sentence-transformers/all-MiniLM-L6-v2', # Fast and reliable 'fast': 'sentence-transformers/all-MiniLM-L6-v2' # Same as balanced for consistency }, 'paraphrasing': { 'premium': 'google-t5/t5-base', # Much lighter than UL2 'balanced': 'google-t5/t5-small', # Good balance 'fast': 'google-t5/t5-small' # Fast and efficient }, 'text_generation': { 'premium': 'google-t5/t5-base', # Much lighter than 70B models 'balanced': 'google-t5/t5-small', # Small and fast 'fast': 'google-t5/t5-small' # Consistent with balanced } } def initialize_nlp(): """Initialize the global NLP model with enhanced capabilities.""" global NLP_GLOBAL if NLP_GLOBAL is None: try: NLP_GLOBAL = spacy.load("en_core_web_sm") logger.info("Successfully loaded spaCy model") except Exception as e: logger.error(f"Failed to load spaCy model: {e}") raise # Initialize on import try: initialize_nlp() except Exception as e: logger.warning(f"Could not initialize NLP model: {e}") @dataclass class TextSegment: """Enhanced text segment with additional metadata.""" content: str segment_type: str # 'text', 'markdown', 'code', 'list', 'header' line_number: int preserve_formatting: bool = False perplexity_score: float = 0.0 ai_probability: float = 0.0 class AdvancedMarkdownPreserver: """Enhanced markdown preservation with better pattern recognition.""" def __init__(self): self.patterns = { 'code_block': re.compile(r'```[\s\S]*?```', re.MULTILINE), 'inline_code': re.compile(r'`[^`]+`'), 'header': re.compile(r'^#{1,6}\s+.*$', re.MULTILINE), 'list_item': re.compile(r'^\s*[-*+]\s+.*$', re.MULTILINE), 'numbered_list': re.compile(r'^\s*\d+\.\s+.*$', re.MULTILINE), 'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'), 'bold': re.compile(r'\*\*([^*]+)\*\*'), 'italic': re.compile(r'\*([^*]+)\*'), 'blockquote': re.compile(r'^>\s+.*$', re.MULTILINE), 'horizontal_rule': re.compile(r'^---+$', re.MULTILINE), 'table_row': re.compile(r'^\s*\|.*\|\s*$', re.MULTILINE), 'latex_math': re.compile(r'\$\$.*?\$\$|\$.*?\$', re.DOTALL), 'footnote': re.compile(r'\[\^[^\]]+\]'), } def segment_text(self, text: str) -> List[TextSegment]: """Segment text with 
    def segment_text(self, text: str) -> List[TextSegment]:
        """Segment text line by line with heuristic AI-detection analysis."""
        segments = []
        lines = text.split('\n')

        for i, line in enumerate(lines):
            segment_type = self._identify_line_type(line)
            preserve = segment_type != 'text'

            # Calculate perplexity and AI probability for text segments
            perplexity = self._calculate_perplexity(line) if segment_type == 'text' else 0.0
            ai_prob = self._calculate_ai_probability(line) if segment_type == 'text' else 0.0

            segments.append(TextSegment(
                content=line,
                segment_type=segment_type,
                line_number=i,
                preserve_formatting=preserve,
                perplexity_score=perplexity,
                ai_probability=ai_prob
            ))

        return segments

    def _identify_line_type(self, line: str) -> str:
        """Identify the formatting type of a single line."""
        if not line.strip():
            return 'empty'

        for pattern_name, pattern in self.patterns.items():
            if pattern.match(line):
                return pattern_name

        return 'text'

    def _calculate_perplexity(self, text: str) -> float:
        """Approximate a perplexity-like score as an AI-detection heuristic.

        This is not true language-model perplexity; it is a cheap proxy based
        on word-length variance.
        """
        if not text.strip():
            return 0.0

        words = word_tokenize(text.lower())
        if len(words) < 3:
            return 0.0

        word_lengths = [len(word) for word in words if word.isalpha()]
        if not word_lengths:
            return 0.0

        avg_length = np.mean(word_lengths)
        length_variance = np.var(word_lengths)

        # AI text tends to have more consistent word lengths (lower variance)
        perplexity = length_variance / (avg_length + 1e-6)
        return min(perplexity, 10.0)  # Cap at 10

    def _calculate_ai_probability(self, text: str) -> float:
        """Estimate the probability that text is AI-generated via six heuristics."""
        if not text.strip():
            return 0.0

        ai_indicators = 0
        total_checks = 6

        # 1. Consistent sentence structure
        sentences = sent_tokenize(text)
        if len(sentences) > 1:
            lengths = [len(sent.split()) for sent in sentences]
            if np.std(lengths) < 3:  # Very consistent lengths
                ai_indicators += 1

        # 2. Overuse of transitional phrases
        transitions = ['however', 'moreover', 'furthermore', 'additionally', 'consequently']
        transition_count = sum(1 for trans in transitions if trans in text.lower())
        if transition_count > len(sentences) * 0.3:
            ai_indicators += 1

        # 3. Lack of contractions
        contractions = ["n't", "'ll", "'re", "'ve", "'d", "'m"]
        if not any(cont in text for cont in contractions) and len(text.split()) > 10:
            ai_indicators += 1

        # 4. Overly formal language in casual contexts
        formal_words = ['utilize', 'facilitate', 'demonstrate', 'implement', 'comprehensive']
        formal_count = sum(1 for word in formal_words if word in text.lower())
        if formal_count > len(text.split()) * 0.1:
            ai_indicators += 1

        # 5. Perfectly clean punctuation: no "..."/"!!"/"??" runs, which
        #    humans use informally but AI output rarely does
        if len(text) > 50 and not re.search(r'[.]{2,}|[!]{2,}|[?]{2,}', text):
            ai_indicators += 1

        # 6. Repetitive phrasing patterns
        words = text.lower().split()
        if len(words) > 10:
            unique_words = len(set(words))
            if unique_words / len(words) < 0.6:  # Low lexical diversity
                ai_indicators += 1

        return ai_indicators / total_checks
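    # Worked example of the heuristic above (illustrative, not from a real
    # detector): for the single sentence
    #   "Moreover, the team will utilize a comprehensive framework to
    #    facilitate and implement the solution effectively."
    # checks 2 (transition word), 3 (no contractions in >10 words),
    # 4 (four formal words out of 15) and 5 (>50 chars, no informal
    # punctuation runs) fire, giving ai_probability = 4 / 6 ≈ 0.67.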
    def reconstruct_text(self, segments: List[TextSegment]) -> str:
        """Reconstruct text from processed segments."""
        return '\n'.join(segment.content for segment in segments)


def download_nltk_resources():
    """Download required NLTK resources with comprehensive coverage."""
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    resources = [
        'punkt', 'averaged_perceptron_tagger', 'punkt_tab', 'wordnet',
        'averaged_perceptron_tagger_eng', 'stopwords', 'vader_lexicon', 'omw-1.4'
    ]

    for resource in resources:
        try:
            nltk.download(resource, quiet=True)
            logger.info(f"Successfully downloaded {resource}")
        except Exception as e:
            logger.warning(f"Could not download {resource}: {str(e)}")


class StateOfTheArtHumanizer:
    """Humanizer driven by the models configured in LATEST_MODELS."""

    def __init__(
        self,
        sentence_model: str = 'fast',         # 🚀 FAST: MiniLM-L6-v2
        paraphrase_model: str = 'fast',       # 🎯 FAST: T5-Small
        text_generation_model: str = 'fast',  # 🔥 FAST: T5-Small
        device: Optional[str] = None,
        enable_advanced_models: bool = True,  # Always enabled for quality
        model_quality: str = 'fast'           # 'premium', 'balanced', 'fast'
    ):
        """Initialize with the configured model tiers."""
        self.device = device or str(DEVICE)
        self.enable_advanced_models = enable_advanced_models
        self.model_quality = model_quality

        # Map model quality to specific models
        self.sentence_model_name = self._get_model_name('sentence_transformers', sentence_model)
        self.paraphrase_model_name = self._get_model_name('paraphrasing', paraphrase_model)
        self.text_gen_model_name = self._get_model_name('text_generation', text_generation_model)

        # Initialize models
        self.sentence_model = None
        self.paraphrase_models = {}
        self.text_gen_model = None

        logger.info("🚀 Initializing SOTA Humanizer with:")
        logger.info(f"   📊 Sentence Model: {self.sentence_model_name}")
        logger.info(f"   🧠 Paraphrase Model: {self.paraphrase_model_name}")
        logger.info(f"   🔥 Text Gen Model: {self.text_gen_model_name}")

        self._initialize_models()

    def _get_model_name(self, category: str, quality: str) -> str:
        """Resolve a quality tier to a model name, or pass a model name through."""
        if quality in LATEST_MODELS[category]:
            return LATEST_MODELS[category][quality]
        # If a specific model name was provided, use it directly
        return quality

    def _initialize_models(self):
        """Initialize all models with error handling."""
        try:
            # Initialize the configured sentence transformer
            logger.info(f"🔄 Loading sentence model: {self.sentence_model_name}")
            self.sentence_model = SentenceTransformer(self.sentence_model_name, device=self.device)
            logger.info("✅ Sentence model loaded successfully")

            # Initialize paraphrasing models
            self._initialize_paraphrase_models(self.paraphrase_model_name)

            # Initialize text generation model (if premium)
            if self.model_quality == 'premium' and self.enable_advanced_models:
                self._initialize_text_generation_model()

        except Exception as e:
            logger.error(f"❌ Model initialization failed: {e}")
            # Fall back to basic models
            self._initialize_fallback_models()

    def _initialize_fallback_models(self):
        """Initialize fallback models if the configured ones fail."""
        try:
            logger.info("🔄 Falling back to reliable models...")
            self.sentence_model = SentenceTransformer(
                'sentence-transformers/all-MiniLM-L6-v2', device=self.device
            )
            self._initialize_paraphrase_models('google-t5/t5-small')
            logger.info("✅ Fallback models loaded successfully")
        except Exception as e:
            logger.error(f"❌ Even fallback models failed: {e}")
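    # Resolution examples for _get_model_name, based on the LATEST_MODELS
    # table at the top of this module ('org/custom-model' is a hypothetical
    # model id used for illustration):
    #
    #   self._get_model_name('paraphrasing', 'fast')
    #       -> 'google-t5/t5-small'
    #   self._get_model_name('paraphrasing', 'org/custom-model')
    #       -> 'org/custom-model'   # unknown keys pass through as model ids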
    def _initialize_text_generation_model(self):
        """Initialize the text generation model.

        Supports DeepSeek/Qwen-style causal LMs if such a model name is
        configured; otherwise falls back to a text2text pipeline.
        """
        try:
            if 'deepseek' in self.text_gen_model_name.lower():
                logger.info(f"🚀 Loading DeepSeek model: {self.text_gen_model_name}")
                # For DeepSeek models, use specific configuration
                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
                    self.text_gen_model_name,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
                    device_map='auto' if self.device != 'cpu' else None,
                    trust_remote_code=True
                )
                logger.info("✅ DeepSeek model loaded successfully")
            elif 'qwen' in self.text_gen_model_name.lower():
                logger.info(f"🔥 Loading Qwen model: {self.text_gen_model_name}")
                # For Qwen models
                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
                    self.text_gen_model_name,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
                    device_map='auto' if self.device != 'cpu' else None
                )
                logger.info("✅ Qwen model loaded successfully")
            else:
                # Use a pipeline for other models; recent transformers versions
                # accept a device string ('cuda', 'mps', 'cpu') here, which also
                # works on Apple Silicon where a CUDA ordinal would fail
                self.text_gen_pipeline = pipeline(
                    "text2text-generation",
                    model=self.text_gen_model_name,
                    device=self.device,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ Text generation pipeline loaded successfully")
        except Exception as e:
            logger.warning(f"⚠️ Advanced text generation model failed to load: {e}")
            self.text_gen_model = None

    def _initialize_paraphrase_models(self, model_name: str):
        """Initialize paraphrasing models with enhanced capabilities."""
        try:
            if 'ul2' in model_name.lower():
                # Special handling for UL2 models
                logger.info(f"🏆 Loading UL2 model: {model_name}")
                self.paraphrase_models['ul2'] = pipeline(
                    "text2text-generation",
                    model=model_name,
                    device=self.device,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ UL2 model loaded successfully")
            elif 'flan-t5' in model_name.lower():
                # FLAN-T5 models
                logger.info(f"🎯 Loading FLAN-T5 model: {model_name}")
                self.paraphrase_models['flan_t5'] = pipeline(
                    "text2text-generation",
                    model=model_name,
                    device=self.device,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ FLAN-T5 model loaded successfully")
            else:
                # Standard T5 models
                self.paraphrase_models['t5'] = pipeline(
                    "text2text-generation",
                    model=model_name,
                    device=self.device,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ T5 model loaded successfully")
        except Exception as e:
            logger.error(f"❌ Paraphrase model initialization failed: {e}")
            raise

    def paraphrase_sentence(self, sentence: str, model_type: str = 'auto') -> str:
        """Paraphrase a sentence with the best available model."""
        if not sentence.strip() or len(sentence.split()) < 5:  # Skip very short sentences
            return sentence

        try:
            # Choose the best available model
            if model_type == 'auto':
                if 'ul2' in self.paraphrase_models:
                    model_type = 'ul2'
                elif 'flan_t5' in self.paraphrase_models:
                    model_type = 'flan_t5'
                else:
                    model_type = 't5'

            model = self.paraphrase_models.get(model_type)
            if not model:
                return sentence

            # Prepare input based on model type - use simple, clean prompts
            if model_type == 'ul2':
                input_text = f"Rewrite: {sentence}"
            elif model_type == 'flan_t5':
                input_text = f"Rewrite this text: {sentence}"
            else:
                # Standard T5 - use the basic paraphrase prompt
                input_text = f"paraphrase: {sentence}"

            # Generate a paraphrase with conservative settings
            result = model(
                input_text,
                max_length=min(len(sentence.split()) * 2 + 10, 100),  # Conservative length
                min_length=max(3, len(sentence.split()) - 3),
                do_sample=True,
                temperature=0.6,  # Lower temperature for more conservative outputs
                top_p=0.8,        # Lower top_p
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                repetition_penalty=1.1
            )

            paraphrased = result[0]['generated_text'].strip()

            # Enhanced quality checks
            if self._is_quality_paraphrase_enhanced(sentence, paraphrased):
                return paraphrased
            return sentence

        except Exception as e:
            logger.warning(f"⚠️ Paraphrasing failed: {e}")
            return sentence
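    # Usage sketch (illustrative; output varies because sampling is enabled):
    #
    #   humanizer = StateOfTheArtHumanizer(paraphrase_model='fast')
    #   humanizer.paraphrase_sentence(
    #       "The experimental results demonstrate a significant improvement.")
    #   # With the default 'fast' tier this sends
    #   # "paraphrase: The experimental results demonstrate a significant
    #   # improvement." through the T5 pipeline, and returns the rewrite only
    #   # if it passes _is_quality_paraphrase_enhanced below.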
    def _is_quality_paraphrase_enhanced(self, original: str, paraphrase: str) -> bool:
        """Strict quality gate for paraphrases."""
        if not paraphrase or paraphrase.strip() == original.strip():
            return False

        # Reject editorial markers or foreign-language leakage from the model
        bad_markers = ['False:', 'Paraphrase:', 'True:', 'Note:', 'Edit:',
                       '[', ']', 'Cette', 'loi', 'aux']
        if any(marker in paraphrase for marker in bad_markers):
            return False

        # Check the length ratio (shouldn't be too different)
        length_ratio = len(paraphrase) / len(original)
        if length_ratio < 0.5 or length_ratio > 2.0:
            return False

        # Check for broken words or missing spaces
        if any(len(word) > 20 for word in paraphrase.split()):
            # Very long words indicate concatenation
            return False

        # Check semantic similarity if available
        try:
            if self.sentence_model:
                embeddings = self.sentence_model.encode([original, paraphrase])
                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()

                # Stricter similarity thresholds
                if 'minilm' in self.sentence_model_name.lower():
                    return 0.7 <= similarity <= 0.95  # Good range for MiniLM
                return 0.65 <= similarity <= 0.95

            return True  # Fallback if no sentence model
        except Exception as e:
            logger.warning(f"⚠️ Quality check failed: {e}")
            return False

    def generate_with_latest_model(self, prompt: str, max_length: int = 150) -> str:
        """Generate text with the loaded generation model (causal LM or pipeline)."""
        if not self.text_gen_model:
            return prompt

        try:
            if hasattr(self, 'text_gen_tokenizer'):
                # Direct model inference for DeepSeek/Qwen-style causal LMs
                inputs = self.text_gen_tokenizer.encode(prompt, return_tensors='pt')
                inputs = inputs.to(self.text_gen_model.device)  # Keep tensors on the model's device
                with torch.no_grad():
                    outputs = self.text_gen_model.generate(
                        inputs,
                        max_length=max_length,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.9,
                        pad_token_id=self.text_gen_tokenizer.eos_token_id
                    )

                generated = self.text_gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
                # Keep only the newly generated continuation
                new_text = generated[len(prompt):].strip()
                return prompt + " " + new_text if new_text else prompt

            elif hasattr(self, 'text_gen_pipeline'):
                # Pipeline inference
                result = self.text_gen_pipeline(
                    prompt,
                    max_length=max_length,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )
                return result[0]['generated_text']

        except Exception as e:
            logger.warning(f"⚠️ Text generation failed: {e}")
            return prompt

        return prompt

    def _is_quality_paraphrase(self, original: str, paraphrase: str) -> bool:
        """Lenient, similarity-only quality check.

        See _is_quality_paraphrase_enhanced for the stricter gate used by
        paraphrase_sentence.
        """
        if not paraphrase or paraphrase.strip() == original.strip():
            return False

        try:
            # Check semantic similarity with the sentence model
            if self.sentence_model:
                embeddings = self.sentence_model.encode([original, paraphrase])
                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()

                # Different model families warrant different thresholds
                if 'bge-m3' in self.sentence_model_name.lower():
                    min_similarity = 0.7   # Higher threshold for BGE-M3
                elif 'mpnet' in self.sentence_model_name.lower():
                    min_similarity = 0.65  # Medium threshold for MPNet
                else:
                    min_similarity = 0.6   # Standard threshold

                return similarity >= min_similarity

            return True  # Fallback if no sentence model
        except Exception as e:
            logger.warning(f"⚠️ Quality check failed: {e}")
            return True  # Conservative fallback
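    # Illustrative gate behaviour (the cosine scores are hypothetical): with
    # the default MiniLM sentence model, a paraphrase scoring 0.88 against its
    # original passes _is_quality_paraphrase_enhanced (0.7 <= 0.88 <= 0.95),
    # 0.99 is rejected as a near-copy, and 0.5 is rejected as semantic drift.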
    def enhance_with_advanced_synonyms(self, text: str) -> str:
        """Context-aware synonym replacement using the sentence model."""
        if not text.strip():
            return text

        try:
            doc = NLP_GLOBAL(text)
            enhanced_tokens = []

            for token in doc:
                # Be conservative about which tokens are eligible for replacement
                if (token.is_alpha and not token.is_stop and len(token.text) > 4 and
                        token.pos_ in ['NOUN', 'VERB', 'ADJ'] and  # 'ADV' excluded; min length raised
                        not token.is_punct and
                        token.lemma_.lower() not in ['say', 'get', 'make', 'take', 'come', 'go']):  # Avoid common verbs

                    # Replace with low probability to limit drift
                    if random.random() < 0.3:  # Only a 30% chance of replacement
                        synonym = self._get_contextual_synonym_advanced(
                            token.text, token.pos_, text, token.i
                        )
                        if synonym and len(synonym) <= len(token.text) + 3:  # Prevent very long replacements
                            enhanced_tokens.append(synonym + token.whitespace_)
                        else:
                            enhanced_tokens.append(token.text_with_ws)
                    else:
                        enhanced_tokens.append(token.text_with_ws)
                else:
                    enhanced_tokens.append(token.text_with_ws)

            result = ''.join(enhanced_tokens)

            # Quality check: prevent text expansion beyond 150%
            if len(result) > len(text) * 1.5:
                return text

            return result
        except Exception as e:
            logger.warning(f"⚠️ Advanced synonym enhancement failed: {e}")
            return text

    def _get_contextual_synonym_advanced(self, word: str, pos: str, context: str, position: int) -> Optional[str]:
        """Select the synonym whose substitution best preserves sentence meaning."""
        try:
            # Get traditional WordNet synonyms first
            synonyms = self._get_wordnet_synonyms(word, pos)
            if not synonyms or not self.sentence_model:
                return None

            # Use the sentence model for context-aware selection
            original_sentence = context
            best_synonym = None
            best_score = -1

            for synonym in synonyms[:5]:  # Limit to the top 5 for efficiency
                # Build a candidate sentence with the synonym substituted.
                # Note: `position` is a spaCy token index, used here as an
                # approximation of the whitespace-split index.
                words = context.split()
                if position < len(words):
                    words[position] = synonym
                    candidate_sentence = ' '.join(words)

                    # Calculate semantic similarity
                    embeddings = self.sentence_model.encode([original_sentence, candidate_sentence])
                    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()

                    # We want high similarity but some variation
                    if 'bge-m3' in self.sentence_model_name.lower():
                        # BGE-M3 similarity scores tend to run higher
                        if 0.85 <= similarity <= 0.98 and similarity > best_score:
                            best_score = similarity
                            best_synonym = synonym
                    else:
                        # Standard models
                        if 0.8 <= similarity <= 0.95 and similarity > best_score:
                            best_score = similarity
                            best_synonym = synonym

            return best_synonym
        except Exception as e:
            logger.warning(f"⚠️ Advanced contextual synonym selection failed: {e}")
            return None

    def _get_wordnet_synonyms(self, word: str, pos: str) -> List[str]:
        """WordNet synonym extraction filtered by part of speech."""
        try:
            # Map spaCy POS to WordNet POS
            pos_map = {
                'NOUN': wordnet.NOUN,
                'VERB': wordnet.VERB,
                'ADJ': wordnet.ADJ,
                'ADV': wordnet.ADV
            }

            wn_pos = pos_map.get(pos)
            if not wn_pos:
                return []

            synonyms = set()
            synsets = wordnet.synsets(word.lower(), pos=wn_pos)

            for synset in synsets[:3]:  # Top 3 synsets
                for lemma in synset.lemmas()[:4]:  # Top 4 lemmas per synset
                    synonym = lemma.name().replace('_', ' ')
                    if synonym.lower() != word.lower() and len(synonym) > 2:
                        synonyms.add(synonym)

            return list(synonyms)
        except Exception as e:
            logger.warning(f"⚠️ WordNet synonym extraction failed: {e}")
            return []
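# End-to-end usage sketch for the class above (illustrative; the configured
# models download on first use, and results vary because sampling is enabled):
#
#   sota = StateOfTheArtHumanizer(model_quality='fast')
#   rewritten = sota.paraphrase_sentence(
#       "The proposed methodology facilitates comprehensive data analysis.")
#   varied = sota.enhance_with_advanced_synonyms(rewritten)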
class AdvancedAcademicTextHumanizer:
    """
    Next-generation text humanizer that combines the ML models above with
    rule-based transformations and AI-detection avoidance techniques.
    """

    def __init__(
        self,
        sentence_model: str = 'fast',         # OPTIMIZED: fast models by default
        paraphrase_model: str = 'fast',       # OPTIMIZED: fast models by default
        p_passive: float = 0.05,              # REDUCED: very conservative passive conversion
        p_synonym_replacement: float = 0.15,  # REDUCED: conservative synonym replacement
        p_academic_transition: float = 0.10,  # REDUCED: conservative transitions
        p_paraphrase: float = 0.10,           # REDUCED: conservative paraphrasing
        seed: Optional[int] = None,
        preserve_formatting: bool = True,
        enable_advanced_models: bool = True,  # OPTIMIZED: always enabled for quality
        ai_avoidance_mode: bool = True        # OPTIMIZED: always enabled for best results
    ):
        """
        Initialize the advanced text humanizer.
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.nlp = NLP_GLOBAL
        if self.nlp is None:
            raise RuntimeError("spaCy model not initialized. Call initialize_nlp() first.")

        # Initialize the advanced models
        self.advanced_humanizer = StateOfTheArtHumanizer(
            sentence_model=sentence_model,
            paraphrase_model=paraphrase_model,
            enable_advanced_models=enable_advanced_models
        )

        # Transformation probabilities, clamped to [0, 1]
        self.p_passive = max(0.0, min(1.0, p_passive))
        self.p_synonym_replacement = max(0.0, min(1.0, p_synonym_replacement))
        self.p_academic_transition = max(0.0, min(1.0, p_academic_transition))
        self.p_paraphrase = max(0.0, min(1.0, p_paraphrase))

        self.preserve_formatting = preserve_formatting
        self.ai_avoidance_mode = ai_avoidance_mode
        self.markdown_preserver = AdvancedMarkdownPreserver()

        # Academic transitions grouped by rhetorical function
        self.academic_transitions = {
            'addition': [
                "Moreover,", "Additionally,", "Furthermore,", "In addition,",
                "What's more,", "Beyond that,", "On top of that,", "Also worth noting,"
            ],
            'contrast': [
                "However,", "Nevertheless,", "Nonetheless,", "Conversely,",
                "On the contrary,", "In contrast,", "That said,", "Yet,"
            ],
            'causation': [
                "Therefore,", "Consequently,", "Thus,", "Hence,",
                "As a result,", "This leads to,", "It follows that,", "Accordingly,"
            ],
            'emphasis': [
                "Notably,", "Significantly,", "Importantly,", "Remarkably,",
                "It's worth emphasizing,", "Particularly noteworthy,", "Crucially,", "Indeed,"
            ],
            'sequence': [
                "Subsequently,", "Following this,", "Thereafter,", "Next,",
                "In the next phase,", "Moving forward,", "Then,", "Later on,"
            ]
        }

        # Comprehensive contraction mapping (suffixes and full forms)
        self.contraction_map = {
            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
            "'ve": " have", "'d": " would", "'m": " am", "'t": " not",
            "won't": "will not", "can't": "cannot", "shouldn't": "should not",
            "wouldn't": "would not", "couldn't": "could not", "mustn't": "must not",
            "isn't": "is not", "aren't": "are not", "wasn't": "was not",
            "weren't": "were not", "haven't": "have not", "hasn't": "has not",
            "hadn't": "had not", "doesn't": "does not", "didn't": "did not",
            "don't": "do not", "let's": "let us", "that's": "that is",
            "there's": "there is", "here's": "here is", "what's": "what is",
            "where's": "where is", "who's": "who is", "it's": "it is"
        }
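    # Expansion example for the map above (hypothetical input):
    # expand_contractions_advanced first rewrites full forms via regex
    # ("won't" -> "will not"), then expands remaining tokenized suffixes
    # ("they" + "'ve" -> "they have"), so "It won't work, they've tried."
    # becomes "It will not work , they have tried ." (word_tokenize splits
    # punctuation into separate tokens).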
"what's": "what is", "where's": "where is", "who's": "who is", "it's": "it is" } def humanize_text( self, text: str, use_passive: bool = False, use_synonyms: bool = False, use_paraphrasing: bool = False, preserve_paragraphs: bool = True ) -> str: """ Advanced text humanization with state-of-the-art techniques. """ if not text or not text.strip(): return text try: if self.preserve_formatting: return self._humanize_with_advanced_preservation( text, use_passive, use_synonyms, use_paraphrasing, preserve_paragraphs ) else: return self._humanize_advanced_simple(text, use_passive, use_synonyms, use_paraphrasing) except Exception as e: logger.error(f"Error during advanced text humanization: {e}") return text def _humanize_with_advanced_preservation( self, text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool, preserve_paragraphs: bool ) -> str: """Advanced humanization with comprehensive formatting preservation.""" segments = self.markdown_preserver.segment_text(text) for segment in segments: if segment.segment_type == 'text' and segment.content.strip(): # Apply AI detection avoidance if needed if self.ai_avoidance_mode and segment.ai_probability > 0.6: segment.content = self._apply_ai_avoidance_techniques( segment.content, use_passive, use_synonyms, use_paraphrasing ) else: segment.content = self._transform_text_segment_advanced( segment.content, use_passive, use_synonyms, use_paraphrasing ) return self.markdown_preserver.reconstruct_text(segments) def _apply_ai_avoidance_techniques( self, text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool ) -> str: """Apply specialized techniques to avoid AI detection.""" try: # 1. Add natural imperfections text = self._add_natural_variations(text) # 2. Increase sentence variety text = self._vary_sentence_structure(text) # 3. Reduce formal language density text = self._reduce_formality(text) # 4. 
    def _add_natural_variations(self, text: str) -> str:
        """Add natural human-like variations."""
        # Occasionally reintroduce contractions to balance formality
        if random.random() < 0.3:
            formal_replacements = {
                "do not": "don't",
                "will not": "won't",
                "cannot": "can't",
                "should not": "shouldn't",
                "would not": "wouldn't"
            }
            for formal, contraction in formal_replacements.items():
                if formal in text and random.random() < 0.4:
                    text = text.replace(formal, contraction, 1)
        return text

    def _vary_sentence_structure(self, text: str) -> str:
        """Increase sentence structure variety."""
        sentences = sent_tokenize(text)
        if len(sentences) < 2:
            return text

        varied_sentences = []
        for i, sentence in enumerate(sentences):
            if i > 0 and random.random() < 0.3:
                # Occasionally start with a different structure
                starters = ["Well,", "Actually,", "Interestingly,", "To be clear,"]
                if not any(sentence.startswith(starter) for starter in starters):
                    starter = random.choice(starters)
                    # Lowercase only the first character so proper nouns
                    # later in the sentence survive
                    sentence = f"{starter} {sentence[0].lower()}{sentence[1:]}"
            varied_sentences.append(sentence)

        return ' '.join(varied_sentences)

    def _reduce_formality(self, text: str) -> str:
        """Reduce excessive formality to appear more human."""
        # Replace overly formal words with more natural alternatives
        formal_to_natural = {
            'utilize': 'use',
            'facilitate': 'help',
            'demonstrate': 'show',
            'implement': 'put in place',
            'comprehensive': 'complete',
            'methodology': 'method',
            'substantial': 'large',
            'numerous': 'many',
            'acquire': 'get'
        }

        for formal, natural in formal_to_natural.items():
            if formal in text.lower() and random.random() < 0.6:
                text = re.sub(r'\b' + formal + r'\b', natural, text, flags=re.IGNORECASE)

        return text

    def _transform_text_segment_advanced(
        self,
        text: str,
        use_passive: bool,
        use_synonyms: bool,
        use_paraphrasing: bool
    ) -> str:
        """Transform a text segment sentence by sentence with the ML models."""
        try:
            doc = self.nlp(text)
            transformed_sentences = []

            for sent in doc.sents:
                sentence_str = sent.text.strip()
                if not sentence_str:
                    continue

                # 1. Expand contractions
                sentence_str = self.expand_contractions_advanced(sentence_str)

                # 2. Advanced paraphrasing
                if use_paraphrasing and random.random() < self.p_paraphrase:
                    paraphrased = self.advanced_humanizer.paraphrase_sentence(sentence_str)
                    if paraphrased != sentence_str:
                        sentence_str = paraphrased

                # 3. Context-aware academic transitions
                if random.random() < self.p_academic_transition:
                    sentence_str = self.add_contextual_transitions(sentence_str)

                # 4. Advanced passive voice conversion
                if use_passive and random.random() < self.p_passive:
                    sentence_str = self.convert_to_passive_advanced(sentence_str)

                # 5. Enhanced contextual synonym replacement
                #    (delegated to the StateOfTheArtHumanizer instance,
                #    which owns enhance_with_advanced_synonyms)
                if use_synonyms and random.random() < self.p_synonym_replacement:
                    sentence_str = self.advanced_humanizer.enhance_with_advanced_synonyms(sentence_str)

                transformed_sentences.append(sentence_str)

            result = ' '.join(transformed_sentences)
            return result if result.strip() else text
        except Exception as e:
            logger.warning(f"Error in advanced transformation: {e}")
            return text
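    # Per-sentence pipeline sketch (probabilities are the constructor
    # defaults): every sentence gets contraction expansion, then with
    # p_paraphrase=0.10 a T5 rewrite, with p_academic_transition=0.10 a
    # transition prefix, with p_passive=0.05 a passive rewrite, and with
    # p_synonym_replacement=0.15 contextual synonyms, so most sentences
    # receive at most one stochastic transformation.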
    def expand_contractions_advanced(self, sentence: str) -> str:
        """Expand contractions in two passes: full forms, then suffixes."""
        # Handle full-word contractions with regex for better accuracy
        for contraction, expansion in self.contraction_map.items():
            if len(contraction) > 3:  # Full-word contractions
                pattern = r'\b' + re.escape(contraction) + r'\b'
                sentence = re.sub(pattern, expansion, sentence, flags=re.IGNORECASE)

        # Handle suffix contractions on the tokenized sentence
        tokens = word_tokenize(sentence)
        expanded_tokens = []

        for token in tokens:
            original_case = token
            lower_token = token.lower()
            replaced = False

            for contraction, expansion in self.contraction_map.items():
                if len(contraction) > 3:
                    continue
                # Skip "'s": it is ambiguous between possessive and "is", and
                # the unambiguous full forms ("that's", "it's", ...) were
                # already expanded by the regex pass above
                if contraction == "'s":
                    continue

                if lower_token == contraction:
                    # word_tokenize splits suffixes into standalone tokens
                    # (e.g. "they've" -> ["they", "'ve"]), so expand those too
                    expanded_tokens.append(expansion.strip())
                    replaced = True
                    break

                if (lower_token.endswith(contraction) and
                        len(lower_token) > len(contraction)):
                    base = lower_token[:-len(contraction)]
                    new_token = base + expansion
                    # Preserve the capitalization pattern
                    if original_case[0].isupper():
                        new_token = new_token[0].upper() + new_token[1:]
                    expanded_tokens.append(new_token)
                    replaced = True
                    break

            if not replaced:
                expanded_tokens.append(token)

        return ' '.join(expanded_tokens)

    def add_contextual_transitions(self, sentence: str) -> str:
        """Add a contextually appropriate academic transition."""
        sentence_lower = sentence.lower()

        # Context detection by keyword families
        context_patterns = {
            'contrast': ['but', 'however', 'although', 'while', 'despite', 'whereas'],
            'causation': ['because', 'since', 'therefore', 'so', 'due to', 'as a result'],
            'addition': ['also', 'and', 'plus', 'including', 'along with'],
            'emphasis': ['important', 'significant', 'notable', 'crucial', 'key'],
            'sequence': ['first', 'second', 'then', 'next', 'finally', 'last']
        }

        # Determine the best transition type
        best_type = 'addition'  # default
        max_matches = 0

        for transition_type, patterns in context_patterns.items():
            matches = sum(1 for pattern in patterns if pattern in sentence_lower)
            if matches > max_matches:
                max_matches = matches
                best_type = transition_type

        # Select an appropriate transition
        transition = random.choice(self.academic_transitions[best_type])
        return f"{transition} {sentence}"

    def convert_to_passive_advanced(self, sentence: str) -> str:
        """Convert a simple active-voice clause to passive voice."""
        try:
            doc = self.nlp(sentence)

            # Find a suitable active-voice pattern
            for token in doc:
                if (token.pos_ == 'VERB' and token.dep_ == 'ROOT' and
                        token.tag_ in ['VBD', 'VBZ', 'VBP']):

                    # Find the subject and object
                    subj = None
                    obj = None
                    for child in token.children:
                        if child.dep_ == 'nsubj':
                            subj = child
                        elif child.dep_ in ['dobj', 'pobj']:
                            obj = child

                    if subj and obj:
                        # Create the passive transformation
                        verb_base = token.lemma_

                        # The auxiliary agrees with the new grammatical subject
                        # (the original object), not the original subject
                        aux = 'was' if obj.tag_ in ['NN', 'NNP'] else 'were'
                        if token.tag_ in ['VBZ', 'VBP']:  # Present tense
                            aux = 'is' if obj.tag_ in ['NN', 'NNP'] else 'are'

                        # Create the past participle
                        if verb_base.endswith('e'):
                            past_participle = verb_base + 'd'
                        elif verb_base in ['go', 'do', 'be', 'have']:
                            # Irregular verbs
                            irregular_map = {'go': 'gone', 'do': 'done', 'be': 'been', 'have': 'had'}
                            past_participle = irregular_map.get(verb_base, verb_base + 'ed')
                        else:
                            past_participle = verb_base + 'ed'

                        # Construct the passive phrase
                        passive_phrase = f"{obj.text} {aux} {past_participle} by {subj.text}"

                        # Replace in the original sentence
                        original_phrase = f"{subj.text} {token.text} {obj.text}"
                        if original_phrase in sentence:
                            return sentence.replace(original_phrase, passive_phrase)

            return sentence
        except Exception as e:
            logger.warning(f"Error in advanced passive conversion: {e}")
            return sentence
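    # Conversion example: for "John completed homework.", spaCy finds
    # subj="John", verb="completed", obj="homework", and the clause is
    # rewritten as "homework was completed by John." Clauses whose object
    # carries a determiner (e.g. "the proposal") do not match the contiguous
    # subject-verb-object phrase and are left unchanged.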
    def get_advanced_transformation_stats(self, original_text: str, transformed_text: str) -> Dict[str, Union[int, float]]:
        """Get comprehensive transformation statistics with ML-based analysis."""
        orig_tokens = word_tokenize(original_text)
        trans_tokens = word_tokenize(transformed_text)
        orig_sents = sent_tokenize(original_text)
        trans_sents = sent_tokenize(transformed_text)

        # Basic counting metrics
        stats = {
            'original_word_count': len(orig_tokens),
            'transformed_word_count': len(trans_tokens),
            'original_sentence_count': len(orig_sents),
            'transformed_sentence_count': len(trans_sents),
            'word_change_ratio': len(trans_tokens) / len(orig_tokens) if orig_tokens else 0,
            'sentence_change_ratio': len(trans_sents) / len(orig_sents) if orig_sents else 0,
            'character_count_original': len(original_text),
            'character_count_transformed': len(transformed_text),
        }

        # Add ML-based analysis
        try:
            # Semantic similarity
            if hasattr(self, 'advanced_humanizer') and self.advanced_humanizer.sentence_model:
                embeddings = self.advanced_humanizer.sentence_model.encode([original_text, transformed_text])
                semantic_similarity = float(util.cos_sim(embeddings[0], embeddings[1]).item())
                stats['semantic_similarity'] = semantic_similarity

            # AI detection metrics
            original_segments = self.markdown_preserver.segment_text(original_text)
            transformed_segments = self.markdown_preserver.segment_text(transformed_text)

            orig_ai_scores = [seg.ai_probability for seg in original_segments if seg.segment_type == 'text']
            trans_ai_scores = [seg.ai_probability for seg in transformed_segments if seg.segment_type == 'text']

            if orig_ai_scores and trans_ai_scores:
                stats['original_ai_probability'] = np.mean(orig_ai_scores)
                stats['transformed_ai_probability'] = np.mean(trans_ai_scores)
                stats['ai_detection_improvement'] = stats['original_ai_probability'] - stats['transformed_ai_probability']
        except Exception as e:
            logger.warning(f"Error calculating advanced stats: {e}")

        return stats

    def _humanize_advanced_simple(self, text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool) -> str:
        """Simple advanced transformation without formatting preservation."""
        paragraphs = text.split('\n\n')
        transformed_paragraphs = []

        for paragraph in paragraphs:
            if paragraph.strip():
                transformed = self._transform_text_segment_advanced(
                    paragraph, use_passive, use_synonyms, use_paraphrasing
                )
                transformed_paragraphs.append(transformed)
            else:
                transformed_paragraphs.append(paragraph)

        return '\n\n'.join(transformed_paragraphs)
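
# Minimal runnable sketch (illustrative defaults; downloads the spaCy, NLTK,
# and transformer models on first run):
if __name__ == "__main__":
    download_nltk_resources()
    humanizer = AdvancedAcademicTextHumanizer(seed=42)
    sample = "The system doesn't fail. It'll demonstrate comprehensive results."
    output = humanizer.humanize_text(sample, use_synonyms=True, use_paraphrasing=True)
    print(output)
    print(humanizer.get_advanced_transformation_stats(sample, output))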