import logging
import os
import re
from datetime import datetime
from typing import Dict, List, Optional, Union

import chromadb
import pandas as pd
from chromadb.utils import embedding_functions

from src.utils.config import OPENAI_API_KEY, COLLECTION_NAME


class VarturRealEstateSearch:
    def __init__(self, persist_directory: str = "database"):
        """Initialize the search system with OpenAI embeddings."""
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        if not os.path.exists(persist_directory):
            os.makedirs(persist_directory)
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.embedding_function = embedding_functions.OpenAIEmbeddingFunction(
            api_key=OPENAI_API_KEY,
            model_name="text-embedding-3-small"
        )
        self._file_stats = {}
        self._collection_initialized = False  # Flag to track collection initialization
        # Set up the Chroma collection as part of construction
        self._initialize_collection()

    def _initialize_collection(self):
        """Initialize the collection if it hasn't been already."""
        if self._collection_initialized:
            return  # Avoid redundant initialization
        try:
            self.collection = self.client.get_or_create_collection(
                name=COLLECTION_NAME,
                embedding_function=self.embedding_function
            )
            self._collection_initialized = True  # Mark collection as initialized
            logging.info(f"Collection {COLLECTION_NAME} is ready")
        except Exception as e:
            logging.error(f"Error initializing collection: {str(e)}")
            raise

    def _normalize_text(self, text: str) -> str:
        """Normalize any text input for consistent matching."""
        if not text:
            return ""
        if not isinstance(text, str):
            text = str(text)
        # Collapse whitespace and convert to lowercase
        return " ".join(text.lower().split())

    def _generate_unit_variations(self, unit_type: str) -> List[str]:
        """Generate different variations of unit type for better matching."""
        variations = set()
        # Normalize the input
        unit = self._normalize_text(unit_type)
        variations.add(unit)
        # Handle bedroom variations
        bedroom_match = re.match(r'(\d+)\s*(?:bedroom|br|b/r|bed)', unit)
        if bedroom_match:
            num = bedroom_match.group(1)
            variations.update([
                f"{num} bedroom",
                f"{num} br",
                f"{num}br",
                f"{num} bed",
                f"{num}bed",
                f"{num} b/r",
                f"{num}b/r",
                f"{num} bedrooms"
            ])
        # Handle other common types
        type_variations = {
            'studio': ['studio', 'studio apartment', 'studio apt'],
            'penthouse': ['penthouse', 'ph', 'pent house'],
            'townhouse': ['townhouse', 'th', 'town house'],
            'villa': ['villa', 'vil'],
            'apartment': ['apartment', 'apt', 'appartment']
        }
        # Add variations for the unit type
        for base_type, variants in type_variations.items():
            if any(variant in unit for variant in variants):
                variations.update(variants)
        return list(variations)
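
    # Illustrative example (hedged): for input "2BR", the regex above matches
    # the normalized form "2br", so the result would include "2 bedroom",
    # "2 br", "2br", "2 bed", "2bed", "2 b/r", "2b/r" and "2 bedrooms".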

    def clean_price(self, price: Union[str, float, int]) -> float:
        """Clean a price value and convert it to float."""
        if isinstance(price, (int, float)):
            return float(price)
        if not isinstance(price, str):
            raise ValueError(f"Unsupported price type: {type(price)}")
        price = price.strip()
        if not price:
            raise ValueError("Empty price value")
        try:
            # Handle million notation (e.g., "1.5M" or "1.5 million")
            million_pattern = r'^(\d+\.?\d*)\s*[mM](?:illion)?$'
            match = re.match(million_pattern, price)
            if match:
                return float(match.group(1)) * 1_000_000
            # Remove any non-numeric characters except dots and commas
            price = ''.join(c for c in price if c.isdigit() or c in '.,')
            # Remove commas and convert to float
            price = price.replace(',', '')
            return float(price)
        except Exception as e:
            raise ValueError(f"Failed to parse price '{price}': {str(e)}")

    def prepare_document(self, row: pd.Series) -> str:
        """Create a searchable document from a row of data with normalized text."""
        try:
            price_value = self.clean_price(row['AskingPrice'])
            formatted_price = f"{price_value / 1_000_000:.2f}M" if price_value >= 1_000_000 else f"{price_value:,.2f}"
            # Create alternative forms for the unit type
            unit_type = str(row['UnitType'])
            unit_variations = self._generate_unit_variations(unit_type)
            # Normalize all fields
            developer = self._normalize_text(str(row['Developer']))
            view = self._normalize_text(str(row['View']))
            total_area = str(row['TotalArea']).replace(',', '')
            # Create a searchable document with all variations
            document = (
                f"Unit {row['UnitCode']} is a {unit_type}. "
                f"Alternative names: {' '.join(unit_variations)}. "
                f"Located on floor {row['Floor']} and developed by {developer}. "
                f"Features a {view} view and total area of {total_area} square feet. "
                f"The asking price is AED {formatted_price}."
            )
            return document
        except Exception as e:
            logging.error(f"Error preparing document: {str(e)}, Row: {row.to_dict()}")
            raise

    def update_file_stats(self, filename: str, count: int):
        """Update file statistics."""
        self._file_stats[filename] = {
            "count": count,
            "last_updated": datetime.now().isoformat()
        }

    def get_file_stats(self):
        """Get current file statistics."""
        return self._file_stats

    def remove_file_stats(self, filename: str):
        """Remove file statistics."""
        if filename in self._file_stats:
            del self._file_stats[filename]

    def load_data(self, csv_path: str, reset_collection: bool = False) -> Dict:
        """Load and process data from a CSV file."""
        try:
            filename = os.path.basename(csv_path)
            # Drop any previously indexed rows for this file if a reset was requested
            if reset_collection:
                existing = self.collection.get()  # Fetch once so ids and metadatas stay aligned
                existing_ids = [doc_id for doc_id, meta in zip(existing.get("ids", []),
                                                               existing.get("metadatas", []))
                                if meta.get("source_file") == filename]
                if existing_ids:
                    self.collection.delete(ids=existing_ids)
                    self.remove_file_stats(filename)
            # Read and process the CSV
            df = pd.read_csv(csv_path)
            documents = []
            metadatas = []
            ids = []
            for idx, row in df.iterrows():
                try:
                    doc = self.prepare_document(row)
                    # Clean and prepare metadata
                    metadata = {
                        "unit_code": str(row["UnitCode"]),
                        "unit_type": self._normalize_text(str(row["UnitType"])),
                        "developer": self._normalize_text(str(row["Developer"])),
                        "price": float(self.clean_price(row["AskingPrice"])),
                        "total_area": float(str(row["TotalArea"]).replace(',', '')),
                        "view": self._normalize_text(str(row["View"])),
                        "floor": str(row["Floor"]),
                        "source_file": filename,
                        "indexed_date": datetime.now().isoformat()
                    }
                    documents.append(doc)
                    metadatas.append(metadata)
                    ids.append(str(row["UnitCode"]))
                except Exception as row_error:
                    logging.warning(f"Error processing row {idx}: {str(row_error)}")
                    continue
            if documents:
                # Add documents to the collection
                self.collection.add(
                    documents=documents,
                    metadatas=metadatas,
                    ids=ids
                )
                self.update_file_stats(filename, len(documents))
                return {
                    "status": "success",
                    "message": f"Added {len(documents)} properties from {filename}",
                    "count": len(documents)
                }
            else:
                return {
                    "status": "error",
                    "message": "No valid data to add",
                    "count": 0
                }
        except Exception as e:
            logging.error(f"Error loading data: {str(e)}")
            return {
                "status": "error",
                "message": str(e),
                "count": 0
            }

    def _clean_metadata(self, metadata: Dict) -> Dict:
        """Clean and normalize metadata values."""
        return {
            'unit_code': str(metadata['unit_code']),
            'unit_type': self._normalize_text(metadata['unit_type']),
            'developer': self._normalize_text(metadata['developer']),
            'view': self._normalize_text(metadata['view']),
            'floor': str(metadata['floor']),
            'price': float(str(metadata['price']).replace(',', '')),
            'total_area': float(str(metadata['total_area']).replace(',', ''))
        }

    def _normalize_floor(self, floor_value: str) -> Optional[int]:
        """Convert a floor value to a normalized integer for comparison."""
        if not floor_value:
            return None
        floor = str(floor_value).strip().upper()
        # Special cases
        floor_map = {
            'G': 0,     # Ground
            'GR': 0,    # Ground
            'LG': 0,    # Lower Ground
            'UG': 0,    # Upper Ground
            'B': -1,    # Basement
            'B1': -1,   # Basement 1
            'B2': -2,   # Basement 2
            'B3': -3,   # Basement 3
            'M': 1,     # Mezzanine
            'P': 1,     # Podium
            'P1': 1,    # Podium 1
            'P2': 2,    # Podium 2
            'P3': 3,    # Podium 3
            'L': 1,     # Lower level
            'PH': 999   # Penthouse (high value for sorting)
        }
        # Check exact matches
        if floor in floor_map:
            return floor_map[floor]
        # Check prefixes, longest first so that e.g. "PH..." is not matched by "P"
        for prefix in sorted(floor_map, key=len, reverse=True):
            if floor.startswith(prefix):
                # Try to get the number after the prefix
                num_part = ''.join(filter(str.isdigit, floor[len(prefix):]))
                if num_part:
                    if prefix.startswith('B'):
                        return -int(num_part)  # Negative for basement levels
                    return int(num_part)
                return floor_map[prefix]
        # Fall back to extracting just the digits
        nums = ''.join(filter(str.isdigit, floor))
        if nums:
            return int(nums)
        return None
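
    # Illustrative mappings (hedged; derived from floor_map and the digit
    # fallback above): "G" -> 0, "B2" -> -2, "PH" -> 999, "12" -> 12,
    # "Level 7" -> 7.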

    def _in_range(self, value: float, min_val: Optional[float] = None, max_val: Optional[float] = None) -> bool:
        """Check if a value is within the specified range."""
        if min_val is not None and value < min_val:
            return False
        if max_val is not None and value > max_val:
            return False
        return True

    def _generate_variations(self, text: str, property_type: str) -> List[str]:
        """Generate variations based on property type."""
        if not text:
            return []
        variations = set([text.lower()])
        if property_type == "unit_type":
            # Handle bedroom variations
            bedroom_match = re.match(r'(\d+)\s*(?:bedroom|br|b/r|bed)', text.lower())
            if bedroom_match:
                num = bedroom_match.group(1)
                variations.update([
                    f"{num} bedroom",
                    f"{num}bedroom",
                    f"{num} br",
                    f"{num}br",
                    f"{num} b/r",
                    f"{num}b/r",
                    f"{num} bed",
                    f"{num}bed"
                ])
            # Handle special types
            type_map = {
                'studio': ['studio', 'studio apartment', 'studio apt'],
                'penthouse': ['penthouse', 'ph', 'pent house'],
                'townhouse': ['townhouse', 'th', 'town house'],
                'duplex': ['duplex', 'du', 'dup'],
                'villa': ['villa', 'vil', 'v'],
                'apartment': ['apartment', 'apt', 'flat']
            }
            for base_type, variants in type_map.items():
                if any(variant in text.lower() for variant in variants):
                    variations.update(variants)
        elif property_type == "view":
            # Handle view variations
            view_map = {
                'sea': ['sea', 'ocean', 'water', 'beach'],
                'burj': ['burj', 'burj khalifa', 'downtown'],
                'garden': ['garden', 'park', 'landscape'],
                'marina': ['marina', 'yacht', 'boat'],
                'palm': ['palm', 'jumeirah', 'atlantis']
            }
            for base_view, variants in view_map.items():
                if any(variant in text.lower() for variant in variants):
                    variations.update(variants)
        return list(variations)
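
    # Illustrative example (hedged): _generate_variations("sea view", "view")
    # would expand to include "sea", "ocean", "water" and "beach", since "sea"
    # appears in the input text.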

    def _check_floor_range(self, floor: str, min_floor: Optional[str] = None, max_floor: Optional[str] = None) -> bool:
        """Check if a floor value is within the specified range."""
        if min_floor is None and max_floor is None:
            return True
        floor_num = self._normalize_floor(floor)
        min_floor_num = self._normalize_floor(min_floor) if min_floor else None
        max_floor_num = self._normalize_floor(max_floor) if max_floor else None
        if floor_num is None:  # If we can't parse the floor, don't filter it out
            return True
        if min_floor_num is not None and floor_num < min_floor_num:
            return False
        if max_floor_num is not None and floor_num > max_floor_num:
            return False
        return True
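
    # Illustrative example (hedged): _check_floor_range("PH", min_floor="10")
    # -> True, because "PH" normalizes to 999, which is above floor 10.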

    def _calculate_similarity(self, query: str, metadata: Dict, base_score: float) -> float:
        """Calculate an enhanced similarity score."""
        score = base_score
        if not query:
            return score
        normalized_query = self._normalize_text(query)
        # Unit type matching
        if any(var in metadata['unit_type']
               for var in self._generate_unit_variations(normalized_query)):
            score *= 1.3  # 30% boost for unit type match
        # View matching
        if normalized_query in metadata['view']:
            score *= 1.2  # 20% boost for view match
        # Developer matching
        if normalized_query in metadata['developer']:
            score *= 1.15  # 15% boost for developer match
        return min(score, 1.0)  # Cap at 1.0
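
    # Illustrative example (hedged): with base_score=0.5, query="2 bedroom"
    # and metadata['unit_type']="2 bedroom", the unit-type boost applies and
    # the result is 0.5 * 1.3 = 0.65.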

    def search(self, query: str, n_results: int = 10) -> List[Dict]:
        """Base vectorized search implementation."""
        try:
            if not self.collection.count():
                return []
            if not query or query.strip() == "*":
                return []
            # Process the query with variations
            normalized_query = self._normalize_text(query)
            variations = self._generate_unit_variations(normalized_query)
            # Build a semantic query with boosted relevant terms
            search_query = f"{normalized_query} {' '.join(variations)}"
            # Get raw results with a buffer for better scoring
            buffer_multiplier = 2
            results = self.collection.query(
                query_texts=[search_query],
                n_results=min(n_results * buffer_multiplier, self.collection.count()),
                include=['metadatas', 'documents', 'distances']
            )
            if not results['ids'][0]:
                return []
            processed_results = []
            for i, doc_id in enumerate(results['ids'][0]):
                try:
                    metadata = results['metadatas'][0][i]
                    cleaned_metadata = self._clean_metadata(metadata)
                    # Calculate the enhanced similarity score
                    base_score = 1 / (1 + float(results['distances'][0][i]))
                    final_score = self._calculate_similarity(query, cleaned_metadata, base_score)
                    processed_results.append({
                        'id': doc_id,
                        'document': results['documents'][0][i],
                        'metadata': cleaned_metadata,
                        'similarity': final_score
                    })
                except Exception as e:
                    logging.warning(f"Error processing result {doc_id}: {e}")
                    continue
            # Sort by enhanced similarity score
            processed_results.sort(key=lambda x: x['similarity'], reverse=True)
            return processed_results[:n_results]
        except Exception as e:
            logging.error(f"Search error: {e}", exc_info=True)
            return []

    def filter_search(self,
                      query: Optional[str] = None,
                      min_price: Optional[float] = None,
                      max_price: Optional[float] = None,
                      min_area: Optional[float] = None,
                      max_area: Optional[float] = None,
                      min_floor: Optional[str] = None,
                      max_floor: Optional[str] = None,
                      unit_type: Optional[str] = None,
                      developer: Optional[str] = None,
                      view: Optional[str] = None,
                      n_results: int = 10) -> List[Dict]:
        """Combined search and filter implementation."""
        try:
            # Get base results
            if query:
                base_results = self.search(query, n_results=self.collection.count())
            else:
                results = self.collection.get(include=['metadatas', 'documents'])
                base_results = [{
                    'id': doc_id,
                    'document': doc,
                    'metadata': self._clean_metadata(meta),
                    'similarity': 1.0
                } for doc_id, doc, meta in zip(
                    results.get('ids', []),
                    results.get('documents', []),
                    results.get('metadatas', [])
                )]
            if not base_results:
                return []
            # Apply filters with enhanced scoring
            filtered_results = []
            for result in base_results:
                try:
                    metadata = result['metadata']
                    score = result['similarity']
                    # Basic range filters
                    if not all([
                        self._in_range(metadata['price'], min_price, max_price),
                        self._in_range(metadata['total_area'], min_area, max_area),
                        self._check_floor_range(metadata['floor'], min_floor, max_floor)
                    ]):
                        continue
                    # Enhanced unit type matching
                    if unit_type:
                        unit_vars = self._generate_unit_variations(metadata['unit_type'])
                        if any(unit_type.lower() == var.lower() for var in unit_vars):
                            score *= 1.3  # Exact match
                        elif any(unit_type.lower() in var.lower() for var in unit_vars):
                            score *= 1.15  # Partial match
                        else:
                            continue
                    # Enhanced developer matching
                    if developer:
                        if developer.lower() == metadata['developer'].lower():
                            score *= 1.25  # Exact match
                        elif developer.lower() in metadata['developer'].lower():
                            score *= 1.1  # Partial match
                        else:
                            continue
                    # Enhanced view matching
                    if view:
                        if view.lower() == metadata['view'].lower():
                            score *= 1.25  # Exact match
                        elif view.lower() in metadata['view'].lower():
                            score *= 1.1  # Partial match
                        else:
                            continue
                    result['similarity'] = min(score, 1.0)  # Cap the final score
                    filtered_results.append(result)
                except Exception as e:
                    logging.warning(f"Filter error for {result['id']}: {e}")
                    continue
            # Sort by final similarity score
            filtered_results.sort(key=lambda x: x['similarity'], reverse=True)
            return filtered_results[:n_results]
        except Exception as e:
            logging.error(f"Filter search error: {e}", exc_info=True)
            return []

    def delete_properties(self, files: Optional[List[str]] = None):
        """Delete properties by source filename."""
        try:
            if files:
                for filename in files:
                    # Re-fetch each pass, since deletions change the collection
                    results = self.collection.get()
                    to_delete = [
                        doc_id for doc_id, meta in zip(results.get("ids", []),
                                                       results.get("metadatas", []))
                        if meta.get("source_file") == filename
                    ]
                    if to_delete:
                        self.collection.delete(ids=to_delete)
                        self.remove_file_stats(filename)
                        logging.info(f"Deleted {len(to_delete)} properties from {filename}")
                    else:
                        logging.warning(f"No properties found for file {filename}")
        except Exception as e:
            logging.error(f"Error deleting properties: {str(e)}")

    def get_collection_stats(self) -> Dict:
        """Get collection statistics."""
        try:
            total_properties = sum(stat["count"] for stat in self._file_stats.values())
            stats = {
                "total_properties": total_properties,
                "file_stats": self._file_stats,
                "collection_name": self.collection.name,
                "last_updated": max((stat["last_updated"] for stat in self._file_stats.values()),
                                    default=None) if self._file_stats else None
            }
            return stats
        except Exception as e:
            logging.error(f"Error getting collection stats: {str(e)}")
            return {"error": str(e)}