Spaces:

Asmitha-28
/

SupportMind

Running

App Files Files Community

SupportMind / src /historical_memory.py

Asmitha-28

Upload src/historical_memory.py with huggingface_hub

6b62dbe verified 2 days ago

raw

history blame contribute delete

4.67 kB

	import os
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.neighbors import NearestNeighbors
	import logging
	from typing import List, Dict

	# Module-level logger, named after this module; HistoricalMemoryLayer uses it for status and error messages.
	logger = logging.getLogger(__name__)

	class HistoricalMemoryLayer:
	    """
	    Historical Memory Layer using Retrieval-Augmented Generation (RAG) concepts.

	    Loads past tickets (intended to be successfully resolved ones) from a CSV
	    file with ``text`` and ``category`` columns, indexes the text with TF-IDF
	    and a cosine-distance nearest-neighbour model, and retrieves the K nearest
	    historical tickets for a new, ambiguous ticket.
	    This can be used to dynamically boost confidence or suggest resolutions.

	    Attributes:
	        data_path: Path of the CSV file the memory is loaded from.
	        vectorizer: TF-IDF vectorizer fitted on the historical ticket text.
	        nn_model: Nearest-neighbour index fitted on the TF-IDF matrix.
	        memory_df: DataFrame read from ``data_path`` (None if the file was
	            never read).
	        is_ready: True only after the index has been built successfully.
	    """

	    def __init__(self, data_path: str = None):
	        """
	        Create the vectorizer / neighbour index and load the ticket history.

	        Args:
	            data_path: CSV file with at least ``text`` and ``category``
	                columns. When None, defaults to ``data/processed/train.csv``
	                under the parent of the directory containing this file.

	        Loading is best-effort: if it fails the instance is still created and
	        ``is_ready`` stays False.
	        """
	        if data_path is None:
	            base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	            data_path = os.path.join(base, 'data', 'processed', 'train.csv')

	        self.data_path = data_path
	        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
	        self.nn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
	        self.memory_df = None
	        self.is_ready = False

	        self._load_memory()

	    def _load_memory(self):
	        """
	        Read the CSV at ``self.data_path`` and fit the TF-IDF / neighbour index.

	        Never raises: a missing file, missing columns, an empty file or any
	        unexpected error is logged and leaves ``is_ready`` False.
	        """
	        try:
	            if not os.path.exists(self.data_path):
	                logger.warning(f"[HistoricalMemory] Data file not found at {self.data_path}")
	                return

	            self.memory_df = pd.read_csv(self.data_path)

	            # Ensure required columns exist
	            if 'text' not in self.memory_df.columns or 'category' not in self.memory_df.columns:
	                logger.warning("[HistoricalMemory] Required columns ('text', 'category') missing.")
	                return

	            # Nothing to index; report it explicitly instead of letting the
	            # vectorizer fail with a generic error.
	            if self.memory_df.empty:
	                logger.warning("[HistoricalMemory] Data file contains no tickets.")
	                return

	            # Normalise the text column once: missing values become '' and
	            # non-string values become str, so the vectorizer does not fail
	            # and retrieved tickets never carry NaN as their text.
	            self.memory_df['text'] = self.memory_df['text'].fillna('').astype(str)

	            # Fit TF-IDF and Nearest Neighbors
	            logger.info(f"[HistoricalMemory] Indexing {len(self.memory_df)} historical tickets...")
	            X = self.vectorizer.fit_transform(self.memory_df['text'])
	            self.nn_model.fit(X)

	            self.is_ready = True
	            logger.info("[HistoricalMemory] Indexing complete.")

	        except Exception as e:
	            # Deliberately broad: a broken memory file must not take the whole
	            # application down. logger.exception keeps the traceback.
	            logger.exception(f"[HistoricalMemory] Failed to load memory: {e}")

	    def retrieve_similar(self, query_text: str, k: int = 3) -> List[Dict]:
	        """
	        Retrieve top K similar historical tickets.

	        Args:
	            query_text: Text of the new ticket.
	            k: Maximum number of tickets to return. Values larger than the
	                number of indexed tickets are clamped to that number.

	        Returns:
	            A list of dicts with keys ``'text'``, ``'category'`` and
	            ``'similarity'`` (1 - cosine distance, rounded to 4 decimals), in
	            the order produced by the neighbour search. The list is empty when
	            the memory is not ready, when ``query_text`` is not a non-blank
	            string, or when ``k`` is not positive.
	        """
	        if not self.is_ready:
	            return []

	        # A None / non-string query would crash the vectorizer, and a blank one
	        # cannot match anything meaningful.
	        if not isinstance(query_text, str) or not query_text.strip():
	            return []

	        # kneighbors() rejects n_neighbors < 1 and n_neighbors greater than the
	        # number of fitted samples, so clamp instead of raising ValueError.
	        k = min(k, len(self.memory_df))
	        if k <= 0:
	            return []

	        # Vectorize query
	        X_query = self.vectorizer.transform([query_text])

	        # Search
	        distances, indices = self.nn_model.kneighbors(X_query, n_neighbors=k)

	        results = []
	        for dist, idx in zip(distances[0], indices[0]):
	            row = self.memory_df.iloc[idx]
	            results.append({
	                'text': row['text'],
	                'category': row['category'],
	                # Cosine distance to similarity score, as a plain Python float
	                'similarity': round(float(1.0 - dist), 4),
	            })

	        return results

	    def compute_historical_boost(self, query_text: str, candidate_category: str, k: int = 5,
	                                 max_boost: float = 0.15) -> float:
	        """
	        Calculate a confidence boost if the most similar past tickets
	        were resolved in the same candidate category.

	        The boost is the similarity-weighted share of the top-k retrieved
	        tickets whose category equals ``candidate_category``, scaled by
	        ``max_boost``.

	        Args:
	            query_text: Text of the new ticket.
	            candidate_category: Category to check the neighbours against
	                (exact equality).
	            k: Number of neighbours to consider.
	            max_boost: Boost returned when every weighted neighbour matches;
	                defaults to 0.15 (15%).

	        Returns:
	            A value between 0.0 and ``max_boost``, rounded to 4 decimals; 0.0
	            when the memory is not ready, nothing is retrieved, or no
	            neighbour has positive similarity.
	        """
	        if not self.is_ready:
	            return 0.0

	        similar_tickets = self.retrieve_similar(query_text, k=k)
	        if not similar_tickets:
	            return 0.0

	        # Count how many of the top-k match the candidate category, weighted by similarity
	        matched_weight = 0.0
	        total_weight = 0.0

	        for t in similar_tickets:
	            # Ignore non-positive weights so the ratio stays within [0, 1].
	            weight = max(float(t['similarity']), 0.0)
	            total_weight += weight
	            if t['category'] == candidate_category:
	                matched_weight += weight

	        if total_weight <= 0:
	            return 0.0

	        match_ratio = matched_weight / total_weight

	        return round(match_ratio * max_boost, 4)

	if __name__ == "__main__":
	    # Manual smoke test: build the memory from the default CSV location and
	    # print, for a few sample tickets, the two nearest historical tickets
	    # plus the confidence boost the 'billing' category would receive.
	    logging.basicConfig(level=logging.INFO)
	    layer = HistoricalMemoryLayer()

	    sample_tickets = (
	        "My invoice from last month is incorrect, please fix the billing.",
	        "The API keeps returning 500 errors since last Tuesday's update.",
	        "How do I add another user to our account?",
	    )

	    for q in sample_tickets:
	        print(f"\nQuery: '{q}'")

	        # Nearest historical tickets for this query
	        for r in layer.retrieve_similar(q, k=2):
	            print(f" -> [{r['category']}] (sim: {r['similarity']:.2f}) {r['text']}")

	        # Boost uses the method's default neighbour count
	        boost = layer.compute_historical_boost(q, "billing")
	        print(f"Historical boost for 'billing': +{boost}")