"""Hybrid course search: Pinecone vector similarity re-ranked by keyword overlap.

Importing this module loads ``link_dict.json`` and connects to the Pinecone
index; running it as a script performs a sample search and prints the results.
"""

import json
import os
import re
from operator import itemgetter
from typing import Dict, List

from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Lookup table keyed by course heading; the demo driver prints its values as
# the course link.
with open('link_dict.json', 'r', encoding='utf-8') as f:
    link_dict = json.load(f)

# Anything that is not an ASCII letter, digit or whitespace becomes a space.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')

# Lower bound on how many vector matches are fetched before re-ranking.
_MIN_CANDIDATES = 30


def initialize():
    """Create the Pinecone index handle and the sentence-embedding model.

    The API key is read from the ``PINECONE_API_KEY`` environment variable
    rather than being embedded in source control.

    Returns:
        A ``(index, model)`` tuple.

    Raises:
        RuntimeError: if ``PINECONE_API_KEY`` is unset or empty.
    """
    # NOTE(security): a key was previously committed in this file; that key
    # should be treated as compromised and rotated.
    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "PINECONE_API_KEY environment variable is not set; "
            "export it before importing or running this module."
        )
    pc = Pinecone(api_key=api_key)
    index = pc.Index("course-search5")
    model = SentenceTransformer('msmarco-distilbert-base-v4')
    return index, model


index, model = initialize()


def normalize_text(text: str) -> str:
    """Lower-case *text*, replace non-alphanumerics with spaces, collapse runs.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    return ' '.join(_NON_ALNUM.sub(' ', text.lower()).split())


def search_courses(query: str, n: int = 10) -> List[Dict]:
    """Return up to *n* courses for *query*, best match first.

    Candidates come from a vector query against the index and are re-scored
    as ``0.4 * semantic + 0.3 * title + 0.15 * description + 0.15 * topic``,
    where the keyword components count query terms found (as substrings) in
    the normalized heading, description and topics.  Results whose normalized
    headings collide are de-duplicated, keeping the highest-scoring one.

    Args:
        query: Free-text search string.
        n: Maximum number of results; values ``<= 0`` yield an empty list.

    Returns:
        A list of dicts with keys ``course``, ``score``, ``semantic_score``,
        ``title_score``, ``desc_score``, ``topic_score``, ``description``
        (truncated to 200 characters plus ``...``) and ``topics``.
    """
    if n <= 0:
        return []

    query_embedding = model.encode(query).tolist()
    # Normalize the query exactly like the indexed text; otherwise a term
    # carrying punctuation (e.g. "gen-ai") can never match.
    query_terms = normalize_text(query).split()

    results = index.query(
        vector=query_embedding,
        # Fetch at least as many candidates as the caller asked for.
        top_k=max(_MIN_CANDIDATES, n),
        include_metadata=True,
    )
    matches = results['matches']
    print(f"Initial results count: {len(matches)}")

    scored = []
    for match in matches:
        metadata = match['metadata']
        course_heading = metadata['course_heading']
        # Guard against explicit nulls stored in the metadata.
        description = metadata.get('description') or ''
        topics = metadata.get('topics') or []

        norm_heading = normalize_text(course_heading)
        norm_desc = normalize_text(description)

        # sum() accumulates left to right, matching a manual += loop.
        title_score = sum(1 for term in query_terms if term in norm_heading)
        desc_score = sum(0.5 for term in query_terms if term in norm_desc)
        topic_score = sum(
            0.7
            for topic in topics
            if any(term in normalize_text(topic) for term in query_terms)
        )

        final_score = (
            match['score'] * 0.4
            + title_score * 0.3
            + desc_score * 0.15
            + topic_score * 0.15
        )

        preview = description[:200] + '...' if len(description) > 200 else description
        scored.append({
            'course': course_heading,
            'score': final_score,
            'semantic_score': match['score'],
            'title_score': title_score,
            'desc_score': desc_score,
            'topic_score': topic_score,
            'description': preview,
            'topics': topics,
        })

    scored.sort(key=itemgetter('score'), reverse=True)

    unique_results = []
    seen_titles = set()
    for result in scored:
        title_key = normalize_text(result['course'])
        if title_key in seen_titles:
            continue
        seen_titles.add(title_key)
        unique_results.append(result)
        if len(unique_results) >= n:
            break

    print(f"Final results count: {len(unique_results)}")
    return unique_results


def main() -> None:
    """Run a sample search and print a report for each hit."""
    query = "GenAI"
    print(f"Searching for: {query}")
    results = search_courses(query, n=10)

    for result in results:
        print("\n" + "="*50)
        print(f"Course: {result['course']}")
        course = result['course']
        # A course missing from the link table should not abort the report.
        print(f"Link: {link_dict.get(course, 'N/A')}")
        print(f"Overall Score: {result['score']:.2f}")
        print(f"Semantic Score: {result['semantic_score']:.2f}")
        print(f"Title Score: {result['title_score']:.2f}")
        print(f"Description Score: {result['desc_score']:.2f}")
        print(f"Topic Score: {result['topic_score']:.2f}")
        print(f"Topics: {', '.join(result['topics'])}")
        print(f"Preview: {result['description']}")


if __name__ == "__main__":
    main()