Spaces:

isl-research
/

ddi-courses-discovery

Sleeping

ddi-courses-discovery / DDICourseSearch.py

Kaung Myat Htet

initialize project

118fce5 3 months ago

7.16 kB

	from dataclasses import dataclass
	from typing import List, Dict, Any, Optional

	import numpy as np
	import pandas as pd
	from annoy import AnnoyIndex
	from flashrank import Ranker, RerankRequest
	from sentence_transformers import SentenceTransformer

	@dataclass
	class CourseSearchResult:
	    """One course returned by a search, together with its relevance scores.

	    The descriptive fields are copied from a catalogue row when the result
	    is created; ``rerank_score`` is only filled in when reranking is applied.
	    """

	    # Catalogue fields, taken from the identically named DataFrame columns
	    # ('Course Code', 'Course Title', 'Course Description', ...).
	    course_code: str
	    course_title: str
	    description: str
	    credits: float
	    category: str
	    department: str
	    track: str
	    prerequisite: str
	    remark: str
	    # First-stage retrieval score (semantic, keyword or hybrid).
	    score: float
	    # Reranker score; stays None until a reranking pass assigns it.
	    rerank_score: Optional[float] = None

	class DDICourseSearch:
	    """Search a course catalogue by embedding similarity, keyword overlap, or both.

	    Retrieval modes:
	      * semantic - nearest neighbours in a pre-built Annoy index over
	        sentence-transformer embeddings of the query;
	      * keyword  - fraction of distinct query terms found in each course text;
	      * hybrid   - weighted sum of the two scores.

	    Results can optionally be reordered by a FlashRank reranker.

	    NOTE(review): results are resolved with ``courses_df.iloc[item_id]``, so
	    this assumes Annoy item ids equal row positions in ``courses_df`` --
	    confirm against the script that built the index file.
	    """

	    # Columns concatenated into the text used for keyword matching and reranking.
	    _SEARCH_TEXT_COLUMNS = (
	        'Course Code',
	        'Course Title',
	        'Course Description',
	        'Track',
	        'Category',
	    )

	    def __init__(self, courses_df, ann_file: str = 'ddi_courses_index.ann', bi_encoder_name: str = 'all-MiniLM-L6-v2',
	                 flash_rank_name: str = "claudecc/flash-rank-reranker",
	                 embedding_dim: int = 384):
	        """Load the embedding model, the Annoy index and the reranker.

	        Args:
	            courses_df: Course catalogue; must provide the columns read by
	                ``_create_search_text`` and ``_create_result_from_row``.
	            ann_file: Path of the Annoy index file to load.
	            bi_encoder_name: SentenceTransformer model used to embed queries.
	            flash_rank_name: Accepted for interface compatibility; it is not
	                forwarded to ``Ranker``, which is built with its default model.
	            embedding_dim: Vector dimension used to construct the Annoy index
	                object before loading ``ann_file``.
	        """
	        self.courses_df = courses_df
	        self.embedding_dim = embedding_dim
	        self.bi_encoder = SentenceTransformer(bi_encoder_name)
	        self.index = AnnoyIndex(embedding_dim, 'angular')
	        # Left at False even after load(); nothing in this class reads it.
	        self.index_built = False
	        self.index.load(ann_file)
	        # Initialize FlashRank reranker
	        self.reranker = Ranker(max_length=128)

	    def _create_search_text(self, row: pd.Series) -> str:
	        """Join the searchable columns of ``row`` into one space-separated string.

	        Missing values are skipped: real NaN/None cells as well as the literal
	        string ``'nan'`` that results from ``str()`` on a missing cell.
	        Components are dropped as a whole rather than deleting the substring
	        'nan' from the joined text, which would corrupt words such as
	        "Finance" or "Governance".
	        """
	        parts = []
	        for column in self._SEARCH_TEXT_COLUMNS:
	            value = row[column]
	            if pd.isna(value):
	                continue
	            text = str(value).strip()
	            if not text or text == 'nan':
	                continue
	            parts.append(text)
	        return ' '.join(parts)

	    def search(self,
	               query: str,
	               k: int = 5,
	               search_type: str = 'hybrid',
	               alpha: float = 0.7,
	               rerank: bool = True,
	               rerank_cutoff: int = 100) -> List["CourseSearchResult"]:
	        """
	        Search for courses using the specified method.

	        Args:
	            query: Search query
	            k: Number of final results to return
	            search_type: 'semantic' or 'keyword'; any other value selects
	                hybrid search
	            alpha: Weight for semantic search in hybrid mode
	            rerank: Whether to apply FlashRank reranking
	            rerank_cutoff: Number of initial results to rerank

	        Returns:
	            List of at most ``k`` CourseSearchResult objects, best first
	        """
	        if k <= 0:
	            return []

	        # When reranking, retrieve a wider candidate pool, but never fewer
	        # than k, otherwise a small rerank_cutoff would cap the result count.
	        pool_size = max(k, rerank_cutoff) if rerank else k

	        if search_type == 'semantic':
	            results = self._semantic_search(query, pool_size)
	        elif search_type == 'keyword':
	            results = self._keyword_search(query, pool_size)
	        else:
	            results = self._hybrid_search(query, pool_size, alpha)

	        # Apply FlashRank reranking if requested
	        if rerank:
	            results = self._rerank_results(query, results, k)

	        return results[:k]

	    def _rerank_results(self, query: str, results: List["CourseSearchResult"], k: int) -> List["CourseSearchResult"]:
	        """Score ``results`` with FlashRank and sort them by that score.

	        Args:
	            query: Search query the passages are scored against.
	            results: Candidates to rerank; sorted in place.
	            k: Unused here (the caller truncates); kept for compatibility.

	        Returns:
	            ``results`` ordered by descending ``rerank_score``.
	        """
	        if not results:
	            return results

	        # Tag every passage with its position so scores can be matched back:
	        # the reranker returns passages ordered by score, not in input order.
	        passages = []
	        for position, result in enumerate(results):
	            text = self._create_search_text(pd.Series({
	                'Course Code': result.course_code,
	                'Course Title': result.course_title,
	                'Course Description': result.description,
	                'Track': result.track,
	                'Category': result.category
	            }))
	            passages.append({'id': position, 'text': text})

	        rerankrequest = RerankRequest(query=query, passages=passages)
	        ranked = self.reranker.rerank(rerankrequest)
	        score_by_position = {item['id']: float(item['score']) for item in ranked}

	        for position, result in enumerate(results):
	            result.rerank_score = score_by_position[position]

	        # Sort by rerank score
	        results.sort(key=lambda x: x.rerank_score, reverse=True)

	        return results

	    def _semantic_search(self, query: str, k: int) -> List["CourseSearchResult"]:
	        """Return the ``k`` nearest courses to the query embedding."""
	        query_embedding = self.bi_encoder.encode(query)
	        indices, distances = self.index.get_nns_by_vector(
	            query_embedding, k, include_distances=True
	        )

	        # Annoy's angular distance is sqrt(2 * (1 - cos)), hence
	        # cos = 1 - d^2 / 2.
	        similarities = [1 - (distance ** 2) / 2 for distance in distances]

	        return [self._create_result(idx, sim) for idx, sim in zip(indices, similarities)]

	    def _keyword_search(self, query: str, k: int) -> List["CourseSearchResult"]:
	        """Rank courses by the fraction of distinct query terms they contain.

	        Terms are lower-cased, whitespace-separated tokens. Ties keep
	        catalogue order because the sort is stable.
	        """
	        query_terms = set(query.lower().split())
	        scores = []

	        # Use row positions, not index labels: _create_result resolves rows
	        # with iloc, and the two differ on a filtered or re-indexed frame.
	        for position, (_, row) in enumerate(self.courses_df.iterrows()):
	            text_terms = set(self._create_search_text(row).lower().split())
	            matches = len(query_terms & text_terms)
	            score = matches / len(query_terms) if query_terms else 0
	            scores.append((position, score))

	        # Sort and get top k
	        scores.sort(key=lambda item: item[1], reverse=True)
	        return [self._create_result(position, score) for position, score in scores[:k]]

	    def _hybrid_search(self, query: str, k: int, alpha: float) -> List["CourseSearchResult"]:
	        """Combine semantic and keyword scores as ``alpha*sem + (1-alpha)*kw``.

	        Each method contributes its own top ``k``; a course found by only one
	        method gets just that method's weighted score. Scores are keyed by
	        course code.
	        """
	        semantic_results = self._semantic_search(query, k)
	        keyword_results = self._keyword_search(query, k)

	        # Combine scores
	        combined_scores = {}
	        for result in semantic_results:
	            combined_scores[result.course_code] = alpha * result.score

	        for result in keyword_results:
	            weighted = (1 - alpha) * result.score
	            if result.course_code in combined_scores:
	                combined_scores[result.course_code] += weighted
	            else:
	                combined_scores[result.course_code] = weighted

	        ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)[:k]

	        results = []
	        for code, score in ranked:
	            # First catalogue row carrying this course code.
	            row = self.courses_df[self.courses_df['Course Code'] == code].iloc[0]
	            results.append(self._create_result_from_row(row, score))

	        return results

	    def _create_result(self, idx: int, score: float) -> "CourseSearchResult":
	        """Create a CourseSearchResult from a row position and a score."""
	        row = self.courses_df.iloc[idx]
	        return self._create_result_from_row(row, score)

	    def _create_result_from_row(self, row: pd.Series, score: float) -> "CourseSearchResult":
	        """Create a CourseSearchResult from a catalogue row and a score.

	        Text-like columns other than code and title are passed through
	        ``str()``, so a missing cell becomes the string 'nan'.
	        """
	        return CourseSearchResult(
	            course_code=row['Course Code'],
	            course_title=row['Course Title'],
	            description=str(row['Course Description']),
	            credits=row['Credits'],
	            category=str(row['Category']),
	            department=str(row['Department']),
	            track=str(row['Track']),
	            prerequisite=str(row['Prerequisite']),
	            remark=str(row['Remark']),
	            score=score
	        )