ddi-courses-discovery / DDICourseSearch.py
Kaung Myat Htet
initialize project
118fce5
raw
history blame
7.16 kB
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from flashrank import Ranker, RerankRequest
from sentence_transformers import SentenceTransformer
@dataclass
class CourseSearchResult:
    """One course returned by a search, together with its ranking scores.

    Attributes:
        course_code: Value of the 'Course Code' column for the course.
        course_title: Value of the 'Course Title' column.
        description: Stringified 'Course Description' column.
        credits: Value of the 'Credits' column.
        category: Stringified 'Category' column.
        department: Stringified 'Department' column.
        track: Stringified 'Track' column.
        prerequisite: Stringified 'Prerequisite' column.
        remark: Stringified 'Remark' column.
        score: First-stage retrieval score (semantic, keyword or hybrid).
        rerank_score: Score assigned by the reranker, or None when the
            result has not been reranked.
    """

    course_code: str
    course_title: str
    description: str
    credits: float
    category: str
    department: str
    track: str
    prerequisite: str
    remark: str
    score: float
    # None until a reranking pass fills it in, hence Optional.
    rerank_score: Optional[float] = None
class DDICourseSearch:
    """Semantic, keyword and hybrid search over a course catalogue.

    The catalogue is a DataFrame with the columns 'Course Code',
    'Course Title', 'Course Description', 'Credits', 'Category',
    'Department', 'Track', 'Prerequisite' and 'Remark'. Semantic search
    uses a pre-built Annoy index whose item ids are treated as row
    positions in that DataFrame. Results can optionally be reordered by a
    FlashRank reranker.
    """

    def __init__(self, courses_df, ann_file: str = 'ddi_courses_index.ann',
                 bi_encoder_name: str = 'all-MiniLM-L6-v2',
                 flash_rank_name: str = "claudecc/flash-rank-reranker",
                 embedding_dim: int = 384):
        """Load the encoder, the Annoy index and the reranker.

        Args:
            courses_df: Course catalogue DataFrame.
            ann_file: Path of the saved Annoy index to load.
            bi_encoder_name: SentenceTransformer model used to embed queries.
            flash_rank_name: Accepted for backward compatibility; it is not
                used, the reranker is created with its default model.
            embedding_dim: Dimensionality of the vectors stored in the index.
        """
        self.courses_df = courses_df
        self.embedding_dim = embedding_dim
        self.bi_encoder = SentenceTransformer(bi_encoder_name)
        self.index = AnnoyIndex(embedding_dim, 'angular')
        self.index_built = False
        self.index.load(ann_file)
        # Only flag the index as usable once the file has actually loaded.
        self.index_built = True
        # Initialize FlashRank reranker
        self.reranker = Ranker(max_length=128)

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True for None, NaN/NA, blank strings and the literal 'nan'."""
        if value is None:
            return True
        if isinstance(value, str):
            # 'nan' appears when a missing cell was stringified earlier
            # (see _create_result_from_row).
            return value.strip() in ('', 'nan')
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def _create_search_text(self, row: pd.Series) -> str:
        """Join the searchable fields of a course into one string.

        Missing fields are skipped as whole values, so words that merely
        contain the letters "nan" (e.g. "Finance") are left intact.
        """
        fields = ('Course Code', 'Course Title', 'Course Description',
                  'Track', 'Category')
        parts = []
        for field_name in fields:
            value = row[field_name]
            if not self._is_missing(value):
                parts.append(str(value).strip())
        return ' '.join(parts)

    def search(self,
               query: str,
               k: int = 5,
               search_type: str = 'hybrid',
               alpha: float = 0.7,
               rerank: bool = True,
               rerank_cutoff: int = 100) -> List["CourseSearchResult"]:
        """
        Search for courses using the specified method.

        Args:
            query: Search query
            k: Number of final results to return
            search_type: One of 'semantic', 'keyword', or 'hybrid'; any
                other value is treated as 'hybrid'
            alpha: Weight for semantic search in hybrid mode
            rerank: Whether to apply FlashRank reranking
            rerank_cutoff: Number of initial results to rerank

        Returns:
            List of at most k CourseSearchResult objects
        """
        if k <= 0:
            return []

        # Never fetch fewer candidates than the caller asked for, even when
        # rerank_cutoff is smaller than k.
        candidate_count = max(k, rerank_cutoff) if rerank else k

        if search_type == 'semantic':
            results = self._semantic_search(query, candidate_count)
        elif search_type == 'keyword':
            results = self._keyword_search(query, candidate_count)
        else:
            results = self._hybrid_search(query, candidate_count, alpha)

        if rerank and results:
            results = self._rerank_results(query, results, k)
        return results[:k]

    def _rerank_results(self, query: str, results: List["CourseSearchResult"],
                        k: int) -> List["CourseSearchResult"]:
        """Score results with FlashRank and sort them by that score.

        The list is sorted in place and returned in full; `k` is accepted
        for interface compatibility and truncation is left to the caller.
        """
        if not results:
            return results

        passages = []
        for position, result in enumerate(results):
            text = self._create_search_text(pd.Series({
                'Course Code': result.course_code,
                'Course Title': result.course_title,
                'Course Description': result.description,
                'Track': result.track,
                'Category': result.category
            }))
            # The id lets each score be matched back to its result: the
            # reranker returns passages ordered by score, not input order.
            passages.append({'id': position, 'text': text})

        request = RerankRequest(query=query, passages=passages)
        for item in self.reranker.rerank(request):
            results[item['id']].rerank_score = float(item['score'])

        # Results left without a score (not expected) sort last.
        results.sort(
            key=lambda r: r.rerank_score if r.rerank_score is not None else float('-inf'),
            reverse=True,
        )
        return results

    def _semantic_search(self, query: str, k: int) -> List["CourseSearchResult"]:
        """Return the k nearest courses to the query in embedding space."""
        query_embedding = self.bi_encoder.encode(query)
        indices, distances = self.index.get_nns_by_vector(
            query_embedding, k, include_distances=True
        )
        # Annoy's angular distance d satisfies d^2 = 2 * (1 - cos), so this
        # recovers the cosine similarity.
        similarities = [1 - (distance ** 2) / 2 for distance in distances]
        return [self._create_result(idx, sim) for idx, sim in zip(indices, similarities)]

    def _keyword_search(self, query: str, k: int) -> List["CourseSearchResult"]:
        """Rank courses by the fraction of query terms found in their text."""
        query_terms = set(query.lower().split())
        scores = []
        # Track row positions explicitly: _create_result uses .iloc, and the
        # index labels yielded by iterrows() need not be positions.
        for position, (_, row) in enumerate(self.courses_df.iterrows()):
            text_terms = set(self._create_search_text(row).lower().split())
            matches = len(query_terms & text_terms)
            score = matches / len(query_terms) if query_terms else 0
            scores.append((position, score))
        # Stable sort: ties keep catalogue order.
        scores.sort(key=lambda item: item[1], reverse=True)
        return [self._create_result(position, score) for position, score in scores[:k]]

    def _hybrid_search(self, query: str, k: int, alpha: float) -> List["CourseSearchResult"]:
        """Combine semantic and keyword search results.

        Each course's score is alpha * semantic + (1 - alpha) * keyword,
        where a component counts as zero if the course is absent from that
        search's top k. Courses are keyed by course code and each code
        contributes at most once per search.
        """
        semantic_results = self._semantic_search(query, k)
        keyword_results = self._keyword_search(query, k)

        # The result objects were created just above, so they can be reused
        # and rescored in place instead of looking rows up again.
        merged = {}
        for result in semantic_results:
            if result.course_code not in merged:
                result.score = alpha * result.score
                merged[result.course_code] = result

        keyword_codes = set()
        for result in keyword_results:
            code = result.course_code
            if code in keyword_codes:
                continue
            keyword_codes.add(code)
            contribution = (1 - alpha) * result.score
            if code in merged:
                merged[code].score += contribution
            else:
                result.score = contribution
                merged[code] = result

        ranked = sorted(merged.values(), key=lambda r: r.score, reverse=True)
        return ranked[:k]

    def _create_result(self, idx: int, score: float) -> "CourseSearchResult":
        """Create a CourseSearchResult from a row position and score."""
        row = self.courses_df.iloc[idx]
        return self._create_result_from_row(row, score)

    def _create_result_from_row(self, row: pd.Series, score: float) -> "CourseSearchResult":
        """Create a CourseSearchResult from a catalogue row and score."""
        return CourseSearchResult(
            course_code=row['Course Code'],
            course_title=row['Course Title'],
            description=str(row['Course Description']),
            credits=row['Credits'],
            category=str(row['Category']),
            department=str(row['Department']),
            track=str(row['Track']),
            prerequisite=str(row['Prerequisite']),
            remark=str(row['Remark']),
            score=score
        )