import spacy
import numpy as np
from typing import Dict, Any, List
import re

# textstat is an optional dependency; fall back gracefully if it is not installed.
try:
    import textstat
except ImportError:
    textstat = None


class StructuralEngine:
    """
    Forensic Structural Engine v10.5 (Human-First Architecture)
    Analyzes Dependency Tree Entropy, Clause Complexity, and Structural Cadence.
    AI tends toward balanced 'clean' trees (high uniformity); human text is 'lopsided' and irregular (structural strength).
    """

    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "textcat"])
            self.enabled = True
        except Exception as e:
            print(f"[StructuralEngine] Error loading spaCy: {e}")
            self.enabled = False

    def analyze(self, text: str) -> Dict[str, Any]:
        if not self.enabled:
            return {"structural_strength": 0.5, "high_uniformity": 0.5, "structural_variation": 0.5, "details": {"error": "spaCy not loaded"}}

        doc = self.nlp(text)
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return {"structural_strength": 0.5, "high_uniformity": 0.5, "structural_variation": 0.5, "details": {"warning": "Too few sentences for structural analysis"}}

        tree_depths = []
        clause_ratios = []  # collected per sentence; currently diagnostic only, not weighted into the aggregates
        branching_factors = []
        punct_counts = []
        lengths = []
        for sent in sentences:
            depths = self._get_token_depths(sent.root)
            tree_depths.append(max(depths) if depths else 0)
            sub_clauses = sum(1 for token in sent if token.dep_ in ("advcl", "relcl", "ccomp", "xcomp"))
            clause_ratios.append(sub_clauses / (len(sent) + 1e-9))
            branches = [len(list(token.children)) for token in sent if len(list(token.children)) > 0]
            branching_factors.append(np.mean(branches) if branches else 0)
            punct_counts.append(sum(1 for token in sent if token.is_punct))
            lengths.append(len(sent))

        # --- NEW HUMAN-CENTRIC METRICS ---
        # A. Depth Variance
        depth_var = float(np.var(tree_depths))
        depth_var_norm = float(np.clip(depth_var / 5.0, 0.0, 1.0))  # Higher is more human

        # B. Punctuation Randomness
        punct_cv = float(np.std(punct_counts) / (np.mean(punct_counts) + 1e-9))
        punct_cv_norm = float(np.clip(punct_cv / 1.5, 0.0, 1.0))  # Higher is more human

        # C. Cadence (Sentence Length Variation)
        len_cv = float(np.std(lengths) / (np.mean(lengths) + 1e-9))
        len_cv_norm = float(np.clip(len_cv / 0.8, 0.0, 1.0))  # Higher is more human
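        # Worked example (illustrative values only): for sentence lengths [5, 25, 8],
        # the mean is roughly 12.67 and the population std roughly 8.81, so
        # CV = std / mean is roughly 0.70 and len_cv_norm is roughly 0.70 / 0.8 = 0.87,
        # a varied, human-leaning cadence score.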

        # D. Readability complexity (via textstat, if available); currently diagnostic only,
        # not weighted into the aggregates.
        if textstat is not None:
            try:
                flesch = textstat.flesch_reading_ease(text)
                readability_complexity = float(np.clip(1.0 - (flesch / 100.0), 0.0, 1.0))
            except Exception:
                readability_complexity = 0.5
        else:
            readability_complexity = 0.5

        # E. Type-Token Ratio (Lexical Diversity)
        words = [t.text.lower() for t in doc if not t.is_punct]
        if words:
            ttr = len(set(words)) / (len(words) + 1e-9)
            ttr_norm = float(np.clip(ttr / 0.8, 0.0, 1.0))  # Higher is more human/diverse
        else:
            ttr = 0.5
            ttr_norm = 0.5

        # F. POS Entropy
        pos_counts = {}
        for t in doc:
            pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
        pos_total = sum(pos_counts.values())
        pos_probs = [c / pos_total for c in pos_counts.values()]
        pos_entropy = -sum(p * np.log2(p) for p in pos_probs if p > 0)
        # Normalize: AI typically has lower POS entropy (more predictable structure)
        pos_entropy_norm = float(np.clip((pos_entropy - 2.0) / 1.5, 0.0, 1.0))
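        # Interpretation (Shannon entropy in bits, illustrative numbers): a text that uses only
        # four POS tags, each equally often, scores H = -4 * (0.25 * log2(0.25)) = 2.0 bits,
        # giving pos_entropy_norm = 0.0; an even spread over about 11 tags approaches
        # log2(11), roughly 3.46 bits, giving pos_entropy_norm of roughly 0.97.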

        # G. Structural Entropy (Uniformity of branching)
        flat_branches = [b for b in branching_factors if b > 0]
        if flat_branches:
            hist, _ = np.histogram(flat_branches, bins=5, range=(0, 5))
            probs = hist / (sum(hist) + 1e-9)
            entropy = -sum(p * np.log2(p + 1e-9) for p in probs)
            # Normalize: AI typically has entropy < 1.0; Human > 1.5
            entropy_norm = float(np.clip((1.5 - entropy) / 1.0, 0.0, 1.0))  # Higher means strictly organized (AI)
        else:
            entropy_norm = 0.5
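        # Example: if every sentence's mean branching factor lands in the same histogram bin,
        # entropy is roughly 0 and entropy_norm is roughly 1.0 (rigidly uniform, AI-leaning);
        # branching spread across several bins pushes entropy past 1.5 and entropy_norm
        # toward 0.0 (human-leaning).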

        # Final Aggregates
        # v16.5 weighting: focus on cadence, entropy, and diversity
        structural_strength = (depth_var_norm * 0.25) + (punct_cv_norm * 0.2) + (len_cv_norm * 0.25) + (ttr_norm * 0.2) + (pos_entropy_norm * 0.1)
        high_uniformity = (1.0 - len_cv_norm) * 0.3 + (1.0 - punct_cv_norm) * 0.2 + (entropy_norm * 0.3) + (1.0 - ttr_norm) * 0.2
        structural_variation = (depth_var_norm + len_cv_norm + ttr_norm) / 3.0
        # We value Entropy and Depth Var most for 2026-level detection
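        # Sanity check on the weighting: both weighted sums are convex combinations of
        # [0, 1] scores (0.25 + 0.2 + 0.25 + 0.2 + 0.1 = 1.0 and 0.3 + 0.2 + 0.3 + 0.2 = 1.0),
        # so all three aggregates stay within [0, 1].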

        return {
            "structural_strength": round(float(structural_strength), 4),
            "high_uniformity": round(float(high_uniformity), 4),
            "structural_variation": round(float(structural_variation), 4),
            "details": {
                "avg_depth": round(float(np.mean(tree_depths)), 2),
                "depth_variance": round(depth_var, 3),
                "sentence_cadence_cv": round(len_cv, 3),
                "punctuation_randomness": round(punct_cv, 3),
                "ttr": round(ttr, 3),
                "pos_entropy": round(pos_entropy, 3)
            }
        }

    def _get_token_depths(self, token, depth=0):
        # Recursively collect the depth of every token in the dependency subtree rooted at `token`.
        depths = [depth]
        for child in token.children:
            depths.extend(self._get_token_depths(child, depth + 1))
        return depths
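

# --- Minimal usage sketch (not part of the engine) ---
# Assumes the en_core_web_sm model is installed (python -m spacy download en_core_web_sm);
# the sample text below is purely illustrative.
if __name__ == "__main__":
    engine = StructuralEngine()
    sample = (
        "The committee met on Tuesday. Nobody expected the vote to go the way it did, "
        "least of all the chairman, who had spent weeks lobbying for the opposite outcome. "
        "Still, the decision stood."
    )
    result = engine.analyze(sample)
    print(result["structural_strength"], result["high_uniformity"])
    print(result["details"])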