import os
import json

import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import pipeline


class RAGSystem:
    """Retrieval-augmented generation over local CSV/JSON business data."""

    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.vector_store = None
        # Small, fast sentence-embedding model; the fully qualified Hub id
        # avoids ambiguity when the name is resolved.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.llm_pipeline = None

    def load_documents(self):
        """Convert each available data source into LangChain Documents."""
        docs = []

        # Customer reviews: one document per review row.
        # Expected columns: Product, Date, Rating, ReviewText.
        reviews_path = os.path.join(self.data_dir, 'customer_reviews.csv')
        if os.path.exists(reviews_path):
            df_reviews = pd.read_csv(reviews_path)
            for _, row in df_reviews.iterrows():
                content = (
                    f"Product: {row.get('Product', 'Unknown')}\n"
                    f"Date: {row.get('Date', '')}\n"
                    f"Rating: {row.get('Rating', '')}\n"
                    f"Review: {row.get('ReviewText', '')}"
                )
                metadata = {"source": "customer_reviews", "product": row.get('Product', 'Unknown')}
                docs.append(Document(page_content=content, metadata=metadata))

        # Web logs: one document per log entry.
        # Expected keys: timestamp, action, page, user_id.
        logs_path = os.path.join(self.data_dir, 'web_logs.json')
        if os.path.exists(logs_path):
            with open(logs_path, 'r') as f:
                logs_data = json.load(f)
            for log in logs_data:
                content = (
                    f"Log Timestamp: {log.get('timestamp', '')}\n"
                    f"Action: {log.get('action', '')}\n"
                    f"Page: {log.get('page', '')}\n"
                    f"User: {log.get('user_id', '')}"
                )
                metadata = {"source": "web_logs"}
                docs.append(Document(page_content=content, metadata=metadata))

        # Sales data: aggregate per product/region rather than indexing raw
        # rows, so retrieval surfaces revenue totals instead of line items.
        # Expected columns: Product, Region, TotalPrice.
        sales_path = os.path.join(self.data_dir, 'sales_data.csv')
        if os.path.exists(sales_path):
            df_sales = pd.read_csv(sales_path)
            summary = df_sales.groupby(['Product', 'Region'])['TotalPrice'].sum().reset_index()
            for _, row in summary.iterrows():
                content = (
                    "Sales Summary:\n"
                    f"Product: {row['Product']}\n"
                    f"Region: {row['Region']}\n"
                    f"Total Revenue: ${row['TotalPrice']:.2f}"
                )
                metadata = {"source": "sales_summary"}
                docs.append(Document(page_content=content, metadata=metadata))

        return docs

    def build_index(self):
        """Split documents into chunks and build the FAISS index."""
        docs = self.load_documents()
        # 500-character chunks with 50-character overlap; most of the small
        # records above fit in a single chunk.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        splits = text_splitter.split_documents(docs)

        if splits:
            self.vector_store = FAISS.from_documents(splits, self.embeddings)
            return True
        return False

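    # Optional persistence helpers -- a minimal sketch, not part of the
    # original class. Assumes the FAISS store's save_local/load_local
    # methods and a writable index directory (path name is illustrative).
    def save_index(self, path="faiss_index"):
        if self.vector_store:
            self.vector_store.save_local(path)

    def load_index(self, path="faiss_index"):
        if os.path.exists(path):
            # Loading uses pickle under the hood, so opting in is required;
            # only load indexes you created yourself.
            self.vector_store = FAISS.load_local(
                path, self.embeddings, allow_dangerous_deserialization=True
            )
            return True
        return False
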
    def init_llm(self):
        """Load a small local seq2seq model for answer generation."""
        # flan-t5-small is CPU-friendly; the first call downloads the weights,
        # so failures (e.g. no network) degrade to retrieval-only mode.
        try:
            self.llm_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
        except Exception as e:
            print(f"Error loading LLM: {e}")
            self.llm_pipeline = None

    def query(self, user_query, k=3):
        if not self.vector_store:
            return {
                "answer": "System not initialized. Please build the index first.",
                "context": []
            }

        # Retrieve the k most similar chunks and stitch them into one context block.
        docs = self.vector_store.similarity_search(user_query, k=k)
        context_text = "\n\n".join(d.page_content for d in docs)

        if self.llm_pipeline:
            prompt = (
                "Summarize the following context to answer the question.\n\n"
                f"Context:\n{context_text}\n\nQuestion: {user_query}\n\nAnswer:"
            )

            # Crude guard against overlong prompts: truncates by characters,
            # not tokens, so the tail of the context may be cut mid-record.
            if len(prompt) > 2048:
                prompt = prompt[:2048]

            result = self.llm_pipeline(prompt, max_length=200, do_sample=False)
            answer = result[0]['generated_text']
        else:
            answer = "LLM not loaded. Displaying retrieved context only."

        return {
            "answer": answer,
            "context": docs
        }
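
# Minimal usage sketch -- assumes a ./data directory containing at least one
# of customer_reviews.csv, web_logs.json, or sales_data.csv; the directory
# name and the sample question are illustrative.
if __name__ == "__main__":
    rag = RAGSystem("data")
    if rag.build_index():
        rag.init_llm()
        response = rag.query("Which region has the highest revenue?")
        print(response["answer"])
        for doc in response["context"]:
            print("-", doc.metadata["source"])
    else:
        print("No documents found in data/.")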