""" | |
Advanced Academic Text Humanizer with State-of-the-Art ML Models | |
This module provides cutting-edge text transformation capabilities using the latest | |
ML models for superior AI text humanization, including T5 paraphrasing, advanced | |
sentence transformers, and AI detection avoidance techniques. | |
""" | |
import ssl
import random
import warnings
import re
import logging
import math
from typing import List, Dict, Tuple, Optional, Union
from dataclasses import dataclass
from functools import lru_cache

import nltk
import spacy
import torch
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet, stopwords
from sentence_transformers import SentenceTransformer, util
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    PegasusForConditionalGeneration, PegasusTokenizer,
    pipeline, AutoTokenizer, AutoModelForCausalLM
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Global models
NLP_GLOBAL = None
DEVICE = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Latest state-of-the-art models configuration
LATEST_MODELS = {
    'sentence_transformers': {
        'premium': 'sentence-transformers/all-MiniLM-L12-v2',  # Lighter premium option
        'balanced': 'sentence-transformers/all-MiniLM-L6-v2',  # Fast and reliable
        'fast': 'sentence-transformers/all-MiniLM-L6-v2'       # Same as balanced for consistency
    },
    'paraphrasing': {
        'premium': 'google-t5/t5-base',    # Much lighter than UL2
        'balanced': 'google-t5/t5-small',  # Good balance
        'fast': 'google-t5/t5-small'       # Fast and efficient
    },
    'text_generation': {
        'premium': 'google-t5/t5-base',    # Much lighter than 70B models
        'balanced': 'google-t5/t5-small',  # Small and fast
        'fast': 'google-t5/t5-small'       # Consistent with balanced
    }
}
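
# Illustrative lookup (see _get_model_name below):
#   LATEST_MODELS['paraphrasing']['fast']  -> 'google-t5/t5-small'
# A quality string that is not one of 'premium'/'balanced'/'fast' is treated
# as a literal Hugging Face model id and passed through unchanged.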


def initialize_nlp():
    """Initialize the global NLP model with enhanced capabilities."""
    global NLP_GLOBAL
    if NLP_GLOBAL is None:
        try:
            NLP_GLOBAL = spacy.load("en_core_web_sm")
            logger.info("Successfully loaded spaCy model")
        except Exception as e:
            logger.error(f"Failed to load spaCy model: {e}")
            raise


# Initialize on import
try:
    initialize_nlp()
except Exception as e:
    logger.warning(f"Could not initialize NLP model: {e}")


@dataclass
class TextSegment:
    """Enhanced text segment with additional metadata."""
    content: str
    segment_type: str  # 'text', 'markdown', 'code', 'list', 'header'
    line_number: int
    preserve_formatting: bool = False
    perplexity_score: float = 0.0
    ai_probability: float = 0.0


class AdvancedMarkdownPreserver:
    """Enhanced markdown preservation with better pattern recognition."""

    def __init__(self):
        self.patterns = {
            'code_block': re.compile(r'```[\s\S]*?```', re.MULTILINE),
            'inline_code': re.compile(r'`[^`]+`'),
            'header': re.compile(r'^#{1,6}\s+.*$', re.MULTILINE),
            'list_item': re.compile(r'^\s*[-*+]\s+.*$', re.MULTILINE),
            'numbered_list': re.compile(r'^\s*\d+\.\s+.*$', re.MULTILINE),
            'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'),
            'bold': re.compile(r'\*\*([^*]+)\*\*'),
            'italic': re.compile(r'\*([^*]+)\*'),
            'blockquote': re.compile(r'^>\s+.*$', re.MULTILINE),
            'horizontal_rule': re.compile(r'^---+$', re.MULTILINE),
            'table_row': re.compile(r'^\s*\|.*\|\s*$', re.MULTILINE),
            'latex_math': re.compile(r'\$\$.*?\$\$|\$.*?\$', re.DOTALL),
            'footnote': re.compile(r'\[\^[^\]]+\]'),
        }

    def segment_text(self, text: str) -> List[TextSegment]:
        """Segment text with enhanced analysis."""
        segments = []
        lines = text.split('\n')
        for i, line in enumerate(lines):
            segment_type = self._identify_line_type(line)
            preserve = segment_type != 'text'
            # Calculate perplexity and AI probability for text segments
            perplexity = self._calculate_perplexity(line) if segment_type == 'text' else 0.0
            ai_prob = self._calculate_ai_probability(line) if segment_type == 'text' else 0.0
            segments.append(TextSegment(
                content=line,
                segment_type=segment_type,
                line_number=i,
                preserve_formatting=preserve,
                perplexity_score=perplexity,
                ai_probability=ai_prob
            ))
        return segments

    def _identify_line_type(self, line: str) -> str:
        """Enhanced line type identification."""
        if not line.strip():
            return 'empty'
        for pattern_name, pattern in self.patterns.items():
            if pattern.match(line):
                return pattern_name
        return 'text'

    def _calculate_perplexity(self, text: str) -> float:
        """Approximate text perplexity as an AI-detection heuristic."""
        if not text.strip():
            return 0.0
        words = word_tokenize(text.lower())
        if len(words) < 3:
            return 0.0
        # Simple perplexity proxy based on word-length patterns
        word_lengths = [len(word) for word in words if word.isalpha()]
        if not word_lengths:
            return 0.0
        avg_length = np.mean(word_lengths)
        length_variance = np.var(word_lengths)
        # AI text tends to have more consistent word lengths (lower variance)
        perplexity = length_variance / (avg_length + 1e-6)
        return min(perplexity, 10.0)  # Cap at 10
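
    # Worked example (illustrative): for "the quick brown fox jumps", the word
    # lengths [3, 5, 5, 3, 5] give mean 4.2 and population variance 0.96, so
    # the returned score is 0.96 / 4.2, roughly 0.23.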

    def _calculate_ai_probability(self, text: str) -> float:
        """Calculate the probability that text is AI-generated."""
        if not text.strip():
            return 0.0
        # Check for AI-typical patterns
        ai_indicators = 0
        total_checks = 6
        # 1. Consistent sentence structure
        sentences = sent_tokenize(text)
        if len(sentences) > 1:
            lengths = [len(sent.split()) for sent in sentences]
            if np.std(lengths) < 3:  # Very consistent lengths
                ai_indicators += 1
        # 2. Overuse of transitional phrases
        transitions = ['however', 'moreover', 'furthermore', 'additionally', 'consequently']
        transition_count = sum(1 for trans in transitions if trans in text.lower())
        if transition_count > len(sentences) * 0.3:
            ai_indicators += 1
        # 3. Lack of contractions
        contractions = ["n't", "'ll", "'re", "'ve", "'d", "'m"]
        if not any(cont in text for cont in contractions) and len(text.split()) > 10:
            ai_indicators += 1
        # 4. Overly formal language in casual contexts
        formal_words = ['utilize', 'facilitate', 'demonstrate', 'implement', 'comprehensive']
        formal_count = sum(1 for word in formal_words if word in text.lower())
        if formal_count > len(text.split()) * 0.1:
            ai_indicators += 1
        # 5. No informal repeated punctuation ('...', '!!', '??') -- a weak
        #    proxy for text that is "too clean" to be casual human writing
        if len(text) > 50 and not re.search(r'[.]{2,}|[!]{2,}|[?]{2,}', text):
            ai_indicators += 1
        # 6. Repetitive phrasing patterns
        words = text.lower().split()
        if len(words) > 10:
            unique_words = len(set(words))
            if unique_words / len(words) < 0.6:  # Low lexical diversity
                ai_indicators += 1
        return ai_indicators / total_checks
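
    # Worked example (illustrative): a single 12-word formal sentence, over 50
    # characters, with no contractions, no transition words, all-distinct words,
    # and two words from the formal list trips checks 3, 4, and 5, so the
    # method returns 3 / 6 = 0.5.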

    def reconstruct_text(self, segments: List[TextSegment]) -> str:
        """Reconstruct text from processed segments."""
        return '\n'.join(segment.content for segment in segments)


def download_nltk_resources():
    """Download required NLTK resources with comprehensive coverage."""
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    resources = [
        'punkt', 'averaged_perceptron_tagger', 'punkt_tab',
        'wordnet', 'averaged_perceptron_tagger_eng', 'stopwords',
        'vader_lexicon', 'omw-1.4'
    ]
    for resource in resources:
        try:
            nltk.download(resource, quiet=True)
            logger.info(f"Successfully downloaded {resource}")
        except Exception as e:
            logger.warning(f"Could not download {resource}: {str(e)}")


class StateOfTheArtHumanizer:
    """State-of-the-art humanizer built on the lightweight models configured above."""

    def __init__(
        self,
        sentence_model: str = 'fast',         # 🚀 FAST: MiniLM-L6-v2
        paraphrase_model: str = 'fast',       # 🎯 FAST: T5-Small
        text_generation_model: str = 'fast',  # 🔥 FAST: T5-Small
        device: Optional[str] = None,
        enable_advanced_models: bool = True,  # Always enabled for quality
        model_quality: str = 'fast'           # 'premium', 'balanced', 'fast'
    ):
        """Initialize with the state-of-the-art models from LATEST_MODELS."""
        self.device = device or str(DEVICE)
        self.enable_advanced_models = enable_advanced_models
        self.model_quality = model_quality
        # Map model quality to specific models
        self.sentence_model_name = self._get_model_name('sentence_transformers', sentence_model)
        self.paraphrase_model_name = self._get_model_name('paraphrasing', paraphrase_model)
        self.text_gen_model_name = self._get_model_name('text_generation', text_generation_model)
        # Initialize models
        self.sentence_model = None
        self.paraphrase_models = {}
        self.text_gen_model = None
        logger.info("🚀 Initializing SOTA Humanizer with:")
        logger.info(f"  📊 Sentence Model: {self.sentence_model_name}")
        logger.info(f"  🧠 Paraphrase Model: {self.paraphrase_model_name}")
        logger.info(f"  🔥 Text Gen Model: {self.text_gen_model_name}")
        self._initialize_models()

    def _get_model_name(self, category: str, quality: str) -> str:
        """Get the actual model name from the quality setting."""
        if quality in LATEST_MODELS[category]:
            return LATEST_MODELS[category][quality]
        else:
            # If a specific model name was provided, use it directly
            return quality

    def _initialize_models(self):
        """Initialize all models with error handling."""
        try:
            # Initialize the sentence transformer
            logger.info(f"🔄 Loading sentence model: {self.sentence_model_name}")
            self.sentence_model = SentenceTransformer(self.sentence_model_name, device=self.device)
            logger.info("✅ Sentence model loaded successfully")
            # Initialize paraphrasing models
            self._initialize_paraphrase_models(self.paraphrase_model_name)
            # Initialize the text generation model (premium quality only)
            if self.model_quality == 'premium' and self.enable_advanced_models:
                self._initialize_text_generation_model()
        except Exception as e:
            logger.error(f"❌ Model initialization failed: {e}")
            # Fall back to basic models
            self._initialize_fallback_models()

    def _initialize_fallback_models(self):
        """Initialize fallback models if the preferred ones fail."""
        try:
            logger.info("🔄 Falling back to reliable models...")
            self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=self.device)
            self._initialize_paraphrase_models('google-t5/t5-small')
            logger.info("✅ Fallback models loaded successfully")
        except Exception as e:
            logger.error(f"❌ Even fallback models failed: {e}")

    def _initialize_text_generation_model(self):
        """Initialize the text generation model.

        DeepSeek and Qwen checkpoints get dedicated handling when such a model
        id is supplied; any other model id is loaded via a text2text pipeline.
        """
        try:
            if 'deepseek' in self.text_gen_model_name.lower():
                logger.info(f"🚀 Loading DeepSeek model: {self.text_gen_model_name}")
                # DeepSeek models require trust_remote_code
                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
                    self.text_gen_model_name,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
                    device_map='auto' if self.device != 'cpu' else None,
                    trust_remote_code=True
                )
                logger.info("✅ DeepSeek model loaded successfully")
            elif 'qwen' in self.text_gen_model_name.lower():
                logger.info(f"🔥 Loading Qwen model: {self.text_gen_model_name}")
                self.text_gen_tokenizer = AutoTokenizer.from_pretrained(self.text_gen_model_name)
                self.text_gen_model = AutoModelForCausalLM.from_pretrained(
                    self.text_gen_model_name,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32,
                    device_map='auto' if self.device != 'cpu' else None
                )
                logger.info("✅ Qwen model loaded successfully")
            else:
                # Use a pipeline for other models (e.g. the default T5 checkpoints)
                self.text_gen_pipeline = pipeline(
                    "text2text-generation",
                    model=self.text_gen_model_name,
                    device=0 if self.device != 'cpu' else -1,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ Text generation pipeline loaded successfully")
        except Exception as e:
            logger.warning(f"⚠️ Advanced text generation model failed to load: {e}")
            self.text_gen_model = None

    def _initialize_paraphrase_models(self, model_name: str):
        """Initialize paraphrasing models with enhanced capabilities."""
        try:
            if 'ul2' in model_name.lower():
                # Special handling for UL2 models
                logger.info(f"🏆 Loading UL2 model: {model_name}")
                self.paraphrase_models['ul2'] = pipeline(
                    "text2text-generation",
                    model=model_name,
                    device=0 if self.device != 'cpu' else -1,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ UL2 model loaded successfully")
            elif 'flan-t5' in model_name.lower():
                # FLAN-T5 models
                logger.info(f"🎯 Loading FLAN-T5 model: {model_name}")
                self.paraphrase_models['flan_t5'] = pipeline(
                    "text2text-generation",
                    model=model_name,
                    device=0 if self.device != 'cpu' else -1,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ FLAN-T5 model loaded successfully")
            else:
                # Standard T5 models
                self.paraphrase_models['t5'] = pipeline(
                    "text2text-generation",
                    model=model_name,
                    device=0 if self.device != 'cpu' else -1,
                    torch_dtype=torch.float16 if self.device != 'cpu' else torch.float32
                )
                logger.info("✅ T5 model loaded successfully")
        except Exception as e:
            logger.error(f"❌ Paraphrase model initialization failed: {e}")
            raise

    def paraphrase_sentence(self, sentence: str, model_type: str = 'auto') -> str:
        """Advanced paraphrasing with the loaded models."""
        if not sentence.strip() or len(sentence.split()) < 5:  # Skip very short sentences
            return sentence
        try:
            # Choose the best available model
            if model_type == 'auto':
                if 'ul2' in self.paraphrase_models:
                    model_type = 'ul2'
                elif 'flan_t5' in self.paraphrase_models:
                    model_type = 'flan_t5'
                else:
                    model_type = 't5'
            model = self.paraphrase_models.get(model_type)
            if not model:
                return sentence
            # Prepare input based on model type -- use simple, clean prompts
            if model_type == 'ul2':
                input_text = f"Rewrite: {sentence}"
            elif model_type == 'flan_t5':
                input_text = f"Rewrite this text: {sentence}"
            else:
                # Standard T5 -- use the basic paraphrase prompt
                input_text = f"paraphrase: {sentence}"
            # Generate a paraphrase with conservative settings
            result = model(
                input_text,
                max_length=min(len(sentence.split()) * 2 + 10, 100),  # Conservative length
                min_length=max(3, len(sentence.split()) - 3),
                do_sample=True,
                temperature=0.6,  # Lower temperature for more conservative outputs
                top_p=0.8,        # Lower top_p
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                repetition_penalty=1.1
            )
            paraphrased = result[0]['generated_text'].strip()
            # Enhanced quality checks
            if self._is_quality_paraphrase_enhanced(sentence, paraphrased):
                return paraphrased
            else:
                return sentence
        except Exception as e:
            logger.warning(f"⚠️ Paraphrasing failed: {e}")
            return sentence
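
    # Illustrative call (assumption: the default 'fast' T5-Small checkpoint is
    # loaded): paraphrase_sentence("The committee approved the new budget
    # proposal today.") feeds "paraphrase: The committee approved ..." to the
    # T5 pipeline and returns the generation only if
    # _is_quality_paraphrase_enhanced accepts it; otherwise the original
    # sentence comes back unchanged.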

    def _is_quality_paraphrase_enhanced(self, original: str, paraphrase: str) -> bool:
        """Enhanced quality check for paraphrases with stricter criteria."""
        if not paraphrase or paraphrase.strip() == original.strip():
            return False
        # Check for editorial markers or foreign-language artifacts
        bad_markers = ['False:', 'Paraphrase:', 'True:', 'Note:', 'Edit:', '[', ']', 'Cette', 'loi', 'aux']
        if any(marker in paraphrase for marker in bad_markers):
            return False
        # Check the length ratio (shouldn't differ too much)
        length_ratio = len(paraphrase) / len(original)
        if length_ratio < 0.5 or length_ratio > 2.0:
            return False
        # Check for broken words or missing spaces
        if any(len(word) > 20 for word in paraphrase.split()):  # Very long words indicate concatenation
            return False
        # Check semantic similarity if available
        try:
            if self.sentence_model:
                embeddings = self.sentence_model.encode([original, paraphrase])
                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
                # Stricter similarity thresholds
                if 'minilm' in self.sentence_model_name.lower():
                    return 0.7 <= similarity <= 0.95  # Good range for MiniLM
                else:
                    return 0.65 <= similarity <= 0.95
            return True  # Fallback if no sentence model
        except Exception as e:
            logger.warning(f"⚠️ Quality check failed: {e}")
            return False

    def generate_with_latest_model(self, prompt: str, max_length: int = 150) -> str:
        """Generate text using the configured text generation model."""
        # Bail out only if neither a direct model nor a pipeline is available
        if not self.text_gen_model and not hasattr(self, 'text_gen_pipeline'):
            return prompt
        try:
            if hasattr(self, 'text_gen_tokenizer'):
                # Direct model inference for DeepSeek/Qwen
                inputs = self.text_gen_tokenizer.encode(prompt, return_tensors='pt')
                # Keep the input tensor on the same device as the model
                inputs = inputs.to(self.text_gen_model.device)
                with torch.no_grad():
                    outputs = self.text_gen_model.generate(
                        inputs,
                        max_length=max_length,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.9,
                        pad_token_id=self.text_gen_tokenizer.eos_token_id
                    )
                generated = self.text_gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
                # Extract only the newly generated part
                new_text = generated[len(prompt):].strip()
                return prompt + " " + new_text if new_text else prompt
            elif hasattr(self, 'text_gen_pipeline'):
                # Pipeline inference
                result = self.text_gen_pipeline(
                    prompt,
                    max_length=max_length,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )
                return result[0]['generated_text']
        except Exception as e:
            logger.warning(f"⚠️ Text generation failed: {e}")
            return prompt
        return prompt

    def _is_quality_paraphrase(self, original: str, paraphrase: str) -> bool:
        """Looser legacy quality check for paraphrases.

        paraphrase_sentence uses the stricter _is_quality_paraphrase_enhanced;
        this variant only enforces a similarity floor.
        """
        if not paraphrase or paraphrase.strip() == original.strip():
            return False
        try:
            # Check semantic similarity using the sentence model
            if self.sentence_model:
                embeddings = self.sentence_model.encode([original, paraphrase])
                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
                # Different model families warrant different thresholds
                if 'bge-m3' in self.sentence_model_name.lower():
                    min_similarity = 0.7   # Higher threshold for BGE-M3
                elif 'mpnet' in self.sentence_model_name.lower():
                    min_similarity = 0.65  # Medium threshold for MPNet
                else:
                    min_similarity = 0.6   # Standard threshold
                return similarity >= min_similarity
            return True  # Fallback if no sentence model
        except Exception as e:
            logger.warning(f"⚠️ Quality check failed: {e}")
            return True  # Conservative fallback

    def enhance_with_advanced_synonyms(self, text: str) -> str:
        """Enhanced synonym replacement using contextual embeddings."""
        if not text.strip():
            return text
        try:
            doc = NLP_GLOBAL(text)
            enhanced_tokens = []
            for token in doc:
                # Be conservative: only content words (no adverbs), at least
                # five characters, and none of the most common verbs
                if (token.is_alpha and not token.is_stop and
                        len(token.text) > 4 and token.pos_ in ['NOUN', 'VERB', 'ADJ'] and
                        not token.is_punct and
                        token.lemma_.lower() not in ['say', 'get', 'make', 'take', 'come', 'go']):
                    # Use contextual synonym selection with low probability
                    if random.random() < 0.3:  # Only a 30% chance of replacement
                        synonym = self._get_contextual_synonym_advanced(
                            token.text, token.pos_, text, token.i
                        )
                        if synonym and len(synonym) <= len(token.text) + 3:  # Prevent very long replacements
                            enhanced_tokens.append(synonym + token.whitespace_)
                        else:
                            enhanced_tokens.append(token.text_with_ws)
                    else:
                        enhanced_tokens.append(token.text_with_ws)
                else:
                    enhanced_tokens.append(token.text_with_ws)
            result = ''.join(enhanced_tokens)
            # Quality check: ensure the result stays a reasonable length
            if len(result) > len(text) * 1.5:  # Prevent expansion beyond 150%
                return text
            return result
        except Exception as e:
            logger.warning(f"⚠️ Advanced synonym enhancement failed: {e}")
            return text

    def _get_contextual_synonym_advanced(self, word: str, pos: str, context: str, position: int) -> Optional[str]:
        """Advanced contextual synonym selection using sentence embeddings."""
        try:
            # Get traditional synonyms first
            synonyms = self._get_wordnet_synonyms(word, pos)
            if not synonyms or not self.sentence_model:
                return None
            # Use the sentence model for context-aware selection
            original_sentence = context
            best_synonym = None
            best_score = -1
            for synonym in synonyms[:5]:  # Limit to top 5 for efficiency
                # Build a candidate sentence by swapping in the synonym. Note:
                # `position` is a spaCy token index, which does not line up
                # with whitespace-split indices, so replace the first
                # whole-word occurrence of the target word instead.
                candidate_sentence = re.sub(
                    r'\b' + re.escape(word) + r'\b', synonym, context, count=1
                )
                if candidate_sentence == context:
                    continue
                # Calculate semantic similarity
                embeddings = self.sentence_model.encode([original_sentence, candidate_sentence])
                similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
                # We want high similarity but some variation
                if 'bge-m3' in self.sentence_model_name.lower():
                    # BGE-M3 is more nuanced
                    if 0.85 <= similarity <= 0.98 and similarity > best_score:
                        best_score = similarity
                        best_synonym = synonym
                else:
                    # Standard models
                    if 0.8 <= similarity <= 0.95 and similarity > best_score:
                        best_score = similarity
                        best_synonym = synonym
            return best_synonym
        except Exception as e:
            logger.warning(f"⚠️ Advanced contextual synonym selection failed: {e}")
            return None

    def _get_wordnet_synonyms(self, word: str, pos: str) -> List[str]:
        """Enhanced WordNet synonym extraction."""
        try:
            # Map spaCy POS to WordNet POS
            pos_map = {
                'NOUN': wordnet.NOUN,
                'VERB': wordnet.VERB,
                'ADJ': wordnet.ADJ,
                'ADV': wordnet.ADV
            }
            wn_pos = pos_map.get(pos)
            if not wn_pos:
                return []
            synonyms = set()
            synsets = wordnet.synsets(word.lower(), pos=wn_pos)
            for synset in synsets[:3]:  # Top 3 synsets
                for lemma in synset.lemmas()[:4]:  # Top 4 lemmas per synset
                    synonym = lemma.name().replace('_', ' ')
                    if synonym.lower() != word.lower() and len(synonym) > 2:
                        synonyms.add(synonym)
            return list(synonyms)
        except Exception as e:
            logger.warning(f"⚠️ WordNet synonym extraction failed: {e}")
            return []


class AdvancedAcademicTextHumanizer:
    """
    Next-generation text humanizer with state-of-the-art ML models and
    advanced AI detection avoidance techniques.
    """

    def __init__(
        self,
        sentence_model: str = 'fast',         # OPTIMIZED: fast models by default
        paraphrase_model: str = 'fast',       # OPTIMIZED: fast models by default
        p_passive: float = 0.05,              # REDUCED: very conservative passive conversion
        p_synonym_replacement: float = 0.15,  # REDUCED: conservative synonym replacement
        p_academic_transition: float = 0.10,  # REDUCED: conservative transitions
        p_paraphrase: float = 0.10,           # REDUCED: conservative paraphrasing
        seed: Optional[int] = None,
        preserve_formatting: bool = True,
        enable_advanced_models: bool = True,  # OPTIMIZED: always enabled for quality
        ai_avoidance_mode: bool = True        # OPTIMIZED: always enabled for best results
    ):
        """
        Initialize the advanced text humanizer with cutting-edge capabilities.
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
        self.nlp = NLP_GLOBAL
        if self.nlp is None:
            raise RuntimeError("spaCy model not initialized. Call initialize_nlp() first.")
        # Initialize advanced models
        self.advanced_humanizer = StateOfTheArtHumanizer(
            sentence_model=sentence_model,
            paraphrase_model=paraphrase_model,
            enable_advanced_models=enable_advanced_models
        )
        # Transformation probabilities, clamped to [0, 1]
        self.p_passive = max(0.0, min(1.0, p_passive))
        self.p_synonym_replacement = max(0.0, min(1.0, p_synonym_replacement))
        self.p_academic_transition = max(0.0, min(1.0, p_academic_transition))
        self.p_paraphrase = max(0.0, min(1.0, p_paraphrase))
        self.preserve_formatting = preserve_formatting
        self.ai_avoidance_mode = ai_avoidance_mode
        self.markdown_preserver = AdvancedMarkdownPreserver()
        # Enhanced academic transitions with variety
        self.academic_transitions = {
            'addition': [
                "Moreover,", "Additionally,", "Furthermore,", "In addition,",
                "What's more,", "Beyond that,", "On top of that,", "Also worth noting,"
            ],
            'contrast': [
                "However,", "Nevertheless,", "Nonetheless,", "Conversely,",
                "On the contrary,", "In contrast,", "That said,", "Yet,"
            ],
            'causation': [
                "Therefore,", "Consequently,", "Thus,", "Hence,",
                "As a result,", "This leads to,", "It follows that,", "Accordingly,"
            ],
            'emphasis': [
                "Notably,", "Significantly,", "Importantly,", "Remarkably,",
                "It's worth emphasizing,", "Particularly noteworthy,", "Crucially,", "Indeed,"
            ],
            'sequence': [
                "Subsequently,", "Following this,", "Thereafter,", "Next,",
                "In the next phase,", "Moving forward,", "Then,", "Later on,"
            ]
        }
        # Comprehensive contraction mapping
        self.contraction_map = {
            "n't": " not", "'re": " are", "'s": " is", "'ll": " will",
            "'ve": " have", "'d": " would", "'m": " am", "'t": " not",
            "won't": "will not", "can't": "cannot", "shouldn't": "should not",
            "wouldn't": "would not", "couldn't": "could not", "mustn't": "must not",
            "isn't": "is not", "aren't": "are not", "wasn't": "was not",
            "weren't": "were not", "haven't": "have not", "hasn't": "has not",
            "hadn't": "had not", "doesn't": "does not", "didn't": "did not",
            "don't": "do not", "let's": "let us", "that's": "that is",
            "there's": "there is", "here's": "here is", "what's": "what is",
            "where's": "where is", "who's": "who is", "it's": "it is"
        }

    def humanize_text(
        self,
        text: str,
        use_passive: bool = False,
        use_synonyms: bool = False,
        use_paraphrasing: bool = False,
        preserve_paragraphs: bool = True
    ) -> str:
        """
        Advanced text humanization with state-of-the-art techniques.
        """
        if not text or not text.strip():
            return text
        try:
            if self.preserve_formatting:
                return self._humanize_with_advanced_preservation(
                    text, use_passive, use_synonyms, use_paraphrasing, preserve_paragraphs
                )
            else:
                return self._humanize_advanced_simple(text, use_passive, use_synonyms, use_paraphrasing)
        except Exception as e:
            logger.error(f"Error during advanced text humanization: {e}")
            return text

    def _humanize_with_advanced_preservation(
        self,
        text: str,
        use_passive: bool,
        use_synonyms: bool,
        use_paraphrasing: bool,
        preserve_paragraphs: bool
    ) -> str:
        """Advanced humanization with comprehensive formatting preservation."""
        segments = self.markdown_preserver.segment_text(text)
        for segment in segments:
            if segment.segment_type == 'text' and segment.content.strip():
                # Apply AI detection avoidance if needed
                if self.ai_avoidance_mode and segment.ai_probability > 0.6:
                    segment.content = self._apply_ai_avoidance_techniques(
                        segment.content, use_passive, use_synonyms, use_paraphrasing
                    )
                else:
                    segment.content = self._transform_text_segment_advanced(
                        segment.content, use_passive, use_synonyms, use_paraphrasing
                    )
        return self.markdown_preserver.reconstruct_text(segments)

    def _apply_ai_avoidance_techniques(
        self,
        text: str,
        use_passive: bool,
        use_synonyms: bool,
        use_paraphrasing: bool
    ) -> str:
        """Apply specialized techniques to avoid AI detection."""
        try:
            # 1. Add natural imperfections
            text = self._add_natural_variations(text)
            # 2. Increase sentence variety
            text = self._vary_sentence_structure(text)
            # 3. Reduce formal language density
            text = self._reduce_formality(text)
            # 4. Apply the standard transformations
            text = self._transform_text_segment_advanced(
                text, use_passive, use_synonyms, use_paraphrasing
            )
            return text
        except Exception as e:
            logger.warning(f"Error in AI avoidance: {e}")
            return text

    def _add_natural_variations(self, text: str) -> str:
        """Add natural human-like variations."""
        # Add occasional contractions to balance formality
        if random.random() < 0.3:
            formal_replacements = {
                "do not": "don't", "will not": "won't", "cannot": "can't",
                "should not": "shouldn't", "would not": "wouldn't"
            }
            for formal, contraction in formal_replacements.items():
                if formal in text and random.random() < 0.4:
                    text = text.replace(formal, contraction, 1)
        return text

    def _vary_sentence_structure(self, text: str) -> str:
        """Increase sentence structure variety."""
        sentences = sent_tokenize(text)
        if len(sentences) < 2:
            return text
        varied_sentences = []
        for i, sentence in enumerate(sentences):
            if i > 0 and random.random() < 0.3:
                # Occasionally start with different structures
                starters = ["Well,", "Actually,", "Interestingly,", "To be clear,"]
                if not any(sentence.startswith(starter) for starter in starters):
                    starter = random.choice(starters)
                    # Lowercase only the first character so proper nouns survive
                    sentence = f"{starter} {sentence[0].lower()}{sentence[1:]}"
            varied_sentences.append(sentence)
        return ' '.join(varied_sentences)

    def _reduce_formality(self, text: str) -> str:
        """Reduce excessive formality to appear more human."""
        # Replace overly formal words with more natural alternatives
        formal_to_natural = {
            'utilize': 'use', 'facilitate': 'help', 'demonstrate': 'show',
            'implement': 'put in place', 'comprehensive': 'complete',
            'methodology': 'method', 'substantial': 'large',
            'numerous': 'many', 'acquire': 'get'
        }
        for formal, natural in formal_to_natural.items():
            if formal in text.lower() and random.random() < 0.6:
                text = re.sub(r'\b' + formal + r'\b', natural, text, flags=re.IGNORECASE)
        return text

    def _transform_text_segment_advanced(
        self,
        text: str,
        use_passive: bool,
        use_synonyms: bool,
        use_paraphrasing: bool
    ) -> str:
        """Advanced text segment transformation with ML models."""
        try:
            doc = self.nlp(text)
            transformed_sentences = []
            for sent in doc.sents:
                sentence_str = sent.text.strip()
                if not sentence_str:
                    continue
                # 1. Expand contractions
                sentence_str = self.expand_contractions_advanced(sentence_str)
                # 2. Advanced paraphrasing
                if use_paraphrasing and random.random() < self.p_paraphrase:
                    paraphrased = self.advanced_humanizer.paraphrase_sentence(sentence_str)
                    if paraphrased != sentence_str:
                        sentence_str = paraphrased
                # 3. Context-aware academic transitions
                if random.random() < self.p_academic_transition:
                    sentence_str = self.add_contextual_transitions(sentence_str)
                # 4. Advanced passive voice conversion
                if use_passive and random.random() < self.p_passive:
                    sentence_str = self.convert_to_passive_advanced(sentence_str)
                # 5. Enhanced contextual synonym replacement (this method lives
                #    on the StateOfTheArtHumanizer, not on this class)
                if use_synonyms and random.random() < self.p_synonym_replacement:
                    sentence_str = self.advanced_humanizer.enhance_with_advanced_synonyms(sentence_str)
                transformed_sentences.append(sentence_str)
            result = ' '.join(transformed_sentences)
            return result if result.strip() else text
        except Exception as e:
            logger.warning(f"Error in advanced transformation: {e}")
            return text

    def expand_contractions_advanced(self, sentence: str) -> str:
        """Enhanced contraction expansion with better context handling."""
        # Handle full-word contractions with regex for better accuracy
        for contraction, expansion in self.contraction_map.items():
            if len(contraction) > 3:  # Full-word contractions
                pattern = r'\b' + re.escape(contraction) + r'\b'
                sentence = re.sub(pattern, expansion, sentence, flags=re.IGNORECASE)
        # Handle suffix contractions
        tokens = word_tokenize(sentence)
        expanded_tokens = []
        for token in tokens:
            original_case = token
            lower_token = token.lower()
            replaced = False
            for contraction, expansion in self.contraction_map.items():
                if (len(contraction) <= 3 and
                        lower_token.endswith(contraction) and
                        len(lower_token) > len(contraction)):
                    base = lower_token[:-len(contraction)]
                    new_token = base + expansion
                    # Preserve the capitalization pattern
                    if original_case[0].isupper():
                        new_token = new_token[0].upper() + new_token[1:]
                    expanded_tokens.append(new_token)
                    replaced = True
                    break
            if not replaced:
                expanded_tokens.append(token)
        return ' '.join(expanded_tokens)
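
    # Illustrative behavior (assumption: standard NLTK tokenization):
    #   expand_contractions_advanced("We don't know what's next")
    #   -> "We do not know what is next"
    # Note that rejoining tokens with single spaces can detach punctuation
    # (e.g. "end ." instead of "end."), a known limitation of this pass.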

    def add_contextual_transitions(self, sentence: str) -> str:
        """Add contextually intelligent academic transitions."""
        sentence_lower = sentence.lower()
        # Enhanced context detection
        context_patterns = {
            'contrast': ['but', 'however', 'although', 'while', 'despite', 'whereas'],
            'causation': ['because', 'since', 'therefore', 'so', 'due to', 'as a result'],
            'addition': ['also', 'and', 'plus', 'including', 'along with'],
            'emphasis': ['important', 'significant', 'notable', 'crucial', 'key'],
            'sequence': ['first', 'second', 'then', 'next', 'finally', 'last']
        }
        # Determine the best transition type
        best_type = 'addition'  # default
        max_matches = 0
        for transition_type, patterns in context_patterns.items():
            matches = sum(1 for pattern in patterns if pattern in sentence_lower)
            if matches > max_matches:
                max_matches = matches
                best_type = transition_type
        # Select an appropriate transition
        transition = random.choice(self.academic_transitions[best_type])
        return f"{transition} {sentence}"

    def convert_to_passive_advanced(self, sentence: str) -> str:
        """Advanced passive voice conversion with better grammatical accuracy."""
        try:
            doc = self.nlp(sentence)
            # Find suitable active voice patterns
            for token in doc:
                if (token.pos_ == 'VERB' and
                        token.dep_ == 'ROOT' and
                        token.tag_ in ['VBD', 'VBZ', 'VBP']):
                    # Find the subject and object
                    subj = None
                    obj = None
                    for child in token.children:
                        if child.dep_ == 'nsubj':
                            subj = child
                        elif child.dep_ in ['dobj', 'pobj']:
                            obj = child
                    if subj and obj:
                        # Create the passive transformation
                        verb_base = token.lemma_
                        # The auxiliary must agree with the object, which
                        # becomes the grammatical subject of the passive
                        aux = 'was' if obj.tag_ in ['NN', 'NNP'] else 'were'
                        if token.tag_ in ['VBZ', 'VBP']:  # Present tense
                            aux = 'is' if obj.tag_ in ['NN', 'NNP'] else 'are'
                        # Create the past participle, checking irregular verbs
                        # before the '-e' rule so 'be' -> 'been', not 'bed'
                        irregular_map = {'go': 'gone', 'do': 'done', 'be': 'been', 'have': 'had'}
                        if verb_base in irregular_map:
                            past_participle = irregular_map[verb_base]
                        elif verb_base.endswith('e'):
                            past_participle = verb_base + 'd'
                        else:
                            past_participle = verb_base + 'ed'
                        # Construct the passive phrase
                        passive_phrase = f"{obj.text} {aux} {past_participle} by {subj.text}"
                        # Replace in the original sentence
                        original_phrase = f"{subj.text} {token.text} {obj.text}"
                        if original_phrase in sentence:
                            return sentence.replace(original_phrase, passive_phrase)
            return sentence
        except Exception as e:
            logger.warning(f"Error in advanced passive conversion: {e}")
            return sentence

    def get_advanced_transformation_stats(self, original_text: str, transformed_text: str) -> Dict[str, Union[int, float]]:
        """Get comprehensive transformation statistics with ML analysis."""
        orig_tokens = word_tokenize(original_text)
        trans_tokens = word_tokenize(transformed_text)
        orig_sents = sent_tokenize(original_text)
        trans_sents = sent_tokenize(transformed_text)
        # Calculate basic count metrics
        stats = {
            'original_word_count': len(orig_tokens),
            'transformed_word_count': len(trans_tokens),
            'original_sentence_count': len(orig_sents),
            'transformed_sentence_count': len(trans_sents),
            'word_change_ratio': len(trans_tokens) / len(orig_tokens) if orig_tokens else 0,
            'sentence_change_ratio': len(trans_sents) / len(orig_sents) if orig_sents else 0,
            'character_count_original': len(original_text),
            'character_count_transformed': len(transformed_text),
        }
        # Add ML-based analysis
        try:
            # Semantic similarity
            if hasattr(self, 'advanced_humanizer') and self.advanced_humanizer.sentence_model:
                embeddings = self.advanced_humanizer.sentence_model.encode([original_text, transformed_text])
                semantic_similarity = float(util.cos_sim(embeddings[0], embeddings[1]).item())
                stats['semantic_similarity'] = semantic_similarity
            # AI detection metrics
            original_segments = self.markdown_preserver.segment_text(original_text)
            transformed_segments = self.markdown_preserver.segment_text(transformed_text)
            orig_ai_scores = [seg.ai_probability for seg in original_segments if seg.segment_type == 'text']
            trans_ai_scores = [seg.ai_probability for seg in transformed_segments if seg.segment_type == 'text']
            if orig_ai_scores and trans_ai_scores:
                stats['original_ai_probability'] = np.mean(orig_ai_scores)
                stats['transformed_ai_probability'] = np.mean(trans_ai_scores)
                stats['ai_detection_improvement'] = stats['original_ai_probability'] - stats['transformed_ai_probability']
        except Exception as e:
            logger.warning(f"Error calculating advanced stats: {e}")
        return stats

    def _humanize_advanced_simple(self, text: str, use_passive: bool, use_synonyms: bool, use_paraphrasing: bool) -> str:
        """Simple advanced transformation without formatting preservation."""
        paragraphs = text.split('\n\n')
        transformed_paragraphs = []
        for paragraph in paragraphs:
            if paragraph.strip():
                transformed = self._transform_text_segment_advanced(
                    paragraph, use_passive, use_synonyms, use_paraphrasing
                )
                transformed_paragraphs.append(transformed)
            else:
                transformed_paragraphs.append(paragraph)
        return '\n\n'.join(transformed_paragraphs)
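

# Minimal usage sketch (assumptions: the spaCy 'en_core_web_sm' model, the NLTK
# resources, and the Hugging Face checkpoints configured above are available;
# the sample text and parameter choices are illustrative only).
if __name__ == "__main__":
    download_nltk_resources()
    humanizer = AdvancedAcademicTextHumanizer(
        sentence_model='fast',
        paraphrase_model='fast',
        seed=42,  # reproducible transformations
    )
    sample = (
        "Moreover, the comprehensive methodology demonstrates that researchers "
        "utilize numerous techniques. However, it is important to note that "
        "the results facilitate further analysis."
    )
    output = humanizer.humanize_text(sample, use_synonyms=True, use_paraphrasing=True)
    print(output)
    print(humanizer.get_advanced_transformation_stats(sample, output))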