Spaces:

swathi-01
/

Adaptive-RAG-Router

Sleeping

swathi madhavan

Initial commit: Adaptive RAG Router with Landing Page and Dashboard

bf6bdf5 4 days ago

1.66 kB

	"""
	Hand-crafted structural features to supplement sentence embeddings.

	WHY THIS EXISTS: diagnostic testing (scripts/diagnose_router.py) showed
	SIMPLE_RAG and MULTI_HOP class centroids have 0.775 cosine similarity in
	embedding space -- by far the closest pair of any two classes (next
	highest is 0.246). Sentence embeddings capture topical similarity, not
	reasoning structure, so a question like "what is X" and "compare X and Y"
	about the same policy topic land very close together even though they
	need completely different retrieval strategies.

	These features give the classifier an explicit signal for reasoning
	structure that the embedding alone can't provide.
	"""
	import re

	import numpy as np

	# Lowercase substrings that hint at comparative / relational reasoning.
	# extract_structural_features() tests each entry with a plain ``in`` check
	# against the lowercased query, so stems such as "differ" also hit
	# "different" / "difference", and the padded " vs " only hits when the
	# token is surrounded by spaces.
	COMPARISON_WORDS = [
	    # explicit comparison
	    "compare",
	    "comparison",
	    "versus",
	    " vs ",
	    " vs.",
	    "difference between",
	    "differ",
	    # interaction / relationship between items
	    "interact",
	    "relationship between",
	    "combined effect",
	    # disagreement between items
	    "contradict",
	    "conflict",
	]

	# Space-padded connector words that suggest a query mentions more than one
	# entity. The padding is part of each literal, so a connector only hits
	# when it has a space on both sides in the lowercased query.
	MULTI_ENTITY_CONNECTORS = [
	    " and ",
	    " both ",
	    " across ",
	    " between ",
	]


	def extract_structural_features(query: str) -> np.ndarray:
	    """Return hand-crafted reasoning-structure features for ``query``.

	    Keyword and connector checks are plain substring tests against the
	    lowercased query; the capitalized-term check runs on the query as given.

	    Args:
	        query: The raw user question.

	    Returns:
	        A 1-D ``float32`` array with four entries, in this order:

	        0. 1.0 if any entry of ``COMPARISON_WORDS`` occurs in the lowercased
	           query, else 0.0.
	        1. 1.0 if any entry of ``MULTI_ENTITY_CONNECTORS`` occurs in the
	           lowercased query, else 0.0.
	        2. Whitespace-separated word count divided by 20 (not clipped, so
	           long queries exceed 1.0).
	        3. 1.0 if the query contains at least two words that start with an
	           uppercase ASCII letter followed by one or more ASCII letters,
	           else 0.0.
	    """
	    q = query.lower()

	    has_comparison_word = float(any(word in q for word in COMPARISON_WORDS))
	    has_multi_entity_connector = float(
	        any(conn in q for conn in MULTI_ENTITY_CONNECTORS)
	    )
	    word_count = float(len(q.split()))

	    # Must look at the original casing: ``q`` has already been lowercased.
	    # NOTE(review): a sentence-initial word such as "What" also matches this
	    # pattern, so a single additional capitalized term is enough to set the
	    # flag -- confirm that this is the intended signal.
	    has_two_capitalized_terms = float(
	        len(re.findall(r"\b[A-Z][a-zA-Z]+\b", query)) >= 2
	    )

	    return np.array([
	        has_comparison_word,
	        has_multi_entity_connector,
	        word_count / 20.0,  # normalize roughly to 0-1ish range
	        has_two_capitalized_terms,
	    ], dtype=np.float32)


	# Number of entries in the vector built by extract_structural_features();
	# update this whenever a feature is added to or removed from that array.
	NUM_STRUCTURAL_FEATURES: int = 4