import spacy
import numpy as np
from typing import Dict, Any, List
import re

# textstat is an optional dependency; fall back gracefully if it is not installed.
try:
    import textstat
except ImportError:
    textstat = None


class StructuralEngine:
    """
    Forensic Structural Engine v10.5 (Human-First Architecture)
    Analyzes Dependency Tree Entropy, Clause Complexity, and Structural Cadence.
    AI tends toward balanced 'clean' trees (high uniformity); human text is 'lopsided' and irregular (structural strength).
    """

    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "textcat"])
            self.enabled = True
        except Exception as e:
            print(f"[StructuralEngine] Error loading spaCy: {e}")
            self.enabled = False

    def analyze(self, text: str) -> Dict[str, Any]:
        if not self.enabled:
            return {"structural_strength": 0.5, "high_uniformity": 0.5, "structural_variation": 0.5, "details": {"error": "spaCy not loaded"}}

        doc = self.nlp(text)
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return {"structural_strength": 0.5, "high_uniformity": 0.5, "structural_variation": 0.5, "details": {"warning": "Too few sentences for structural analysis"}}

        tree_depths = []
        clause_ratios = []  # collected per sentence; currently diagnostic only, not weighted into the aggregates
        branching_factors = []
        punct_counts = []
        lengths = []
        for sent in sentences:
            depths = self._get_token_depths(sent.root)
            tree_depths.append(max(depths) if depths else 0)
            sub_clauses = sum(1 for token in sent if token.dep_ in ("advcl", "relcl", "ccomp", "xcomp"))
            clause_ratios.append(sub_clauses / (len(sent) + 1e-9))
            branches = [len(list(token.children)) for token in sent if len(list(token.children)) > 0]
            branching_factors.append(np.mean(branches) if branches else 0)
            punct_counts.append(sum(1 for token in sent if token.is_punct))
            lengths.append(len(sent))

        # --- NEW HUMAN-CENTRIC METRICS ---
        # A. Depth Variance
        depth_var = float(np.var(tree_depths))
        depth_var_norm = float(np.clip(depth_var / 5.0, 0.0, 1.0))  # Higher is more human

        # B. Punctuation Randomness
        punct_cv = float(np.std(punct_counts) / (np.mean(punct_counts) + 1e-9))
        punct_cv_norm = float(np.clip(punct_cv / 1.5, 0.0, 1.0))  # Higher is more human

        # C. Cadence (Sentence Length Variation)
        len_cv = float(np.std(lengths) / (np.mean(lengths) + 1e-9))
        len_cv_norm = float(np.clip(len_cv / 0.8, 0.0, 1.0))  # Higher is more human
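        # Worked example (illustrative values only): for sentence lengths [5, 25, 8],
        # the mean is roughly 12.67 and the population std roughly 8.81, so
        # CV = std / mean is roughly 0.70 and len_cv_norm is roughly 0.70 / 0.8 = 0.87,
        # a varied, human-leaning cadence score.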

        # D. Readability complexity (via textstat, if available); currently diagnostic only,
        # not weighted into the aggregates.
        if textstat is not None:
            try:
                flesch = textstat.flesch_reading_ease(text)
                readability_complexity = float(np.clip(1.0 - (flesch / 100.0), 0.0, 1.0))
            except Exception:
                readability_complexity = 0.5
        else:
            readability_complexity = 0.5

        # E. Type-Token Ratio (Lexical Diversity)
        words = [t.text.lower() for t in doc if not t.is_punct]
        if words:
            ttr = len(set(words)) / (len(words) + 1e-9)
            ttr_norm = float(np.clip(ttr / 0.8, 0.0, 1.0))  # Higher is more human/diverse
        else:
            ttr = 0.5
            ttr_norm = 0.5

        # F. POS Entropy
        pos_counts = {}
        for t in doc:
            pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
        pos_total = sum(pos_counts.values())
        pos_probs = [c / pos_total for c in pos_counts.values()]
        pos_entropy = -sum(p * np.log2(p) for p in pos_probs if p > 0)
        # Normalize: AI typically has lower POS entropy (more predictable structure)
        pos_entropy_norm = float(np.clip((pos_entropy - 2.0) / 1.5, 0.0, 1.0))
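        # Interpretation (Shannon entropy in bits, illustrative numbers): a text that uses only
        # four POS tags, each equally often, scores H = -4 * (0.25 * log2(0.25)) = 2.0 bits,
        # giving pos_entropy_norm = 0.0; an even spread over about 11 tags approaches
        # log2(11), roughly 3.46 bits, giving pos_entropy_norm of roughly 0.97.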

        # G. Structural Entropy (Uniformity of branching)
        flat_branches = [b for b in branching_factors if b > 0]
        if flat_branches:
            hist, _ = np.histogram(flat_branches, bins=5, range=(0, 5))
            probs = hist / (sum(hist) + 1e-9)
            entropy = -sum(p * np.log2(p + 1e-9) for p in probs)
            # Normalize: AI typically has entropy < 1.0; Human > 1.5
            entropy_norm = float(np.clip((1.5 - entropy) / 1.0, 0.0, 1.0))  # Higher means strictly organized (AI)
        else:
            entropy_norm = 0.5
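        # Example: if every sentence's mean branching factor lands in the same histogram bin,
        # entropy is roughly 0 and entropy_norm is roughly 1.0 (rigidly uniform, AI-leaning);
        # branching spread across several bins pushes entropy past 1.5 and entropy_norm
        # toward 0.0 (human-leaning).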

        # Final Aggregates
        # v16.5 weighting: focus on cadence, entropy, and diversity
        structural_strength = (depth_var_norm * 0.25) + (punct_cv_norm * 0.2) + (len_cv_norm * 0.25) + (ttr_norm * 0.2) + (pos_entropy_norm * 0.1)
        high_uniformity = (1.0 - len_cv_norm) * 0.3 + (1.0 - punct_cv_norm) * 0.2 + (entropy_norm * 0.3) + (1.0 - ttr_norm) * 0.2
        structural_variation = (depth_var_norm + len_cv_norm + ttr_norm) / 3.0
        # We value Entropy and Depth Var most for 2026-level detection
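        # Sanity check on the weighting: both weighted sums are convex combinations of
        # [0, 1] scores (0.25 + 0.2 + 0.25 + 0.2 + 0.1 = 1.0 and 0.3 + 0.2 + 0.3 + 0.2 = 1.0),
        # so all three aggregates stay within [0, 1].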

        return {
            "structural_strength": round(float(structural_strength), 4),
            "high_uniformity": round(float(high_uniformity), 4),
            "structural_variation": round(float(structural_variation), 4),
            "details": {
                "avg_depth": round(float(np.mean(tree_depths)), 2),
                "depth_variance": round(depth_var, 3),
                "sentence_cadence_cv": round(len_cv, 3),
                "punctuation_randomness": round(punct_cv, 3),
                "ttr": round(ttr, 3),
                "pos_entropy": round(pos_entropy, 3)
            }
        }

    def _get_token_depths(self, token, depth=0):
        # Recursively collect the depth of every token in the dependency subtree rooted at `token`.
        depths = [depth]
        for child in token.children:
            depths.extend(self._get_token_depths(child, depth + 1))
        return depths
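

# --- Minimal usage sketch (not part of the engine) ---
# Assumes the en_core_web_sm model is installed (python -m spacy download en_core_web_sm);
# the sample text below is purely illustrative.
if __name__ == "__main__":
    engine = StructuralEngine()
    sample = (
        "The committee met on Tuesday. Nobody expected the vote to go the way it did, "
        "least of all the chairman, who had spent weeks lobbying for the opposite outcome. "
        "Still, the decision stood."
    )
    result = engine.analyze(sample)
    print(result["structural_strength"], result["high_uniformity"])
    print(result["details"])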