import json
import re
from typing import Dict, List

import streamlit as st
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

st.set_page_config(
    page_title="Course Search Engine",
    page_icon="🔍",
    layout="wide"
)


@st.cache_resource
def load_links() -> Dict[str, str]:
    """Load the course-title -> URL mapping used to render 'Go to Course' links."""
    with open('link_dict.json', 'r', encoding='utf-8') as f:
        return json.load(f)


@st.cache_resource
def initialize():
    """Connect to the Pinecone index and load the embedding model (cached across reruns)."""
    # NOTE: prefer loading the API key from st.secrets or an environment variable
    # rather than hard-coding it in source.
    pc = Pinecone(api_key="pcsk_48jq17_8zsXqWFqrSZVSi9fFqMnxjsa8L3iP1CPDCZ88z7j1eq5y8MZvEjwrj7yd9T5ERH")
    index = pc.Index("course-search5")
    model = SentenceTransformer('msmarco-distilbert-base-v4')
    return index, model


def normalize_text(text: str) -> str:
    """Lowercase, strip punctuation, and collapse whitespace for keyword matching."""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    return ' '.join(text.split())


def search_courses(index, model, query: str, n: int = 10) -> List[Dict]:
    """Run a semantic query against Pinecone, re-rank with keyword boosts, and deduplicate by title."""
    query_embedding = model.encode(query).tolist()
    query_terms = query.lower().split()

    # Over-fetch (top_k=30) so there is room to re-rank and deduplicate down to n results.
    results = index.query(
        vector=query_embedding,
        top_k=30,
        include_metadata=True
    )
    print(f"Initial results count: {len(results['matches'])}")

    filtered_results = []
    for match in results['matches']:
        metadata = match['metadata']
        course_heading = metadata['course_heading']
        description = metadata.get('description', '')
        topics = metadata.get('topics', [])

        norm_heading = normalize_text(course_heading)
        norm_desc = normalize_text(description)

        # Keyword boosts: +1 per query term found in the title, +0.5 per term in the
        # description, +0.7 per topic containing any query term.
        title_score = 0
        desc_score = 0
        topic_score = 0
        for term in query_terms:
            if term in norm_heading:
                title_score += 1
        for term in query_terms:
            if term in norm_desc:
                desc_score += 0.5
        for topic in topics:
            if any(term in normalize_text(topic) for term in query_terms):
                topic_score += 0.7

        # Blend the Pinecone similarity score with the keyword scores.
        final_score = (
            match['score'] * 0.4 +
            title_score * 0.3 +
            desc_score * 0.15 +
            topic_score * 0.15
        )

        filtered_results.append({
            'course': course_heading,
            'score': final_score,
            'semantic_score': match['score'],
            'title_score': title_score,
            'desc_score': desc_score,
            'topic_score': topic_score,
            'description': description[:200] + '...' if len(description) > 200 else description,
            'topics': topics
        })

    filtered_results.sort(key=lambda item: item['score'], reverse=True)

    # Keep only the first occurrence of each normalized course title, up to n results.
    unique_results = []
    seen_titles = set()
    for result in filtered_results:
        title_key = normalize_text(result['course'])
        if title_key not in seen_titles:
            seen_titles.add(title_key)
            unique_results.append(result)
        if len(unique_results) >= n:
            break

    print(f"Final results count: {len(unique_results)}")
    return unique_results


st.title("🎓 Course Search Engine")
st.write("Search for courses using natural language queries")

query = st.text_input("Enter your search query:", placeholder="e.g., GenAI, Machine Learning, Python...")
num_results = st.slider("Number of results:", min_value=1, max_value=20, value=10)

if st.button("Search") or query:
    if query:
        with st.spinner("Searching..."):
            try:
                link_dict = load_links()
                index, model = initialize()
                results = search_courses(index, model, query, n=num_results)
                if results:
                    for result in results:
                        with st.expander(f"📚 {result['course']} (Score: {result['score']:.2f})"):
                            st.write(f"**Description:** {result['description']}")
                            st.write(f"**Topics:** {', '.join(result['topics'])}")
                            if result['course'] in link_dict:
                                st.markdown(f"[Go to Course]({link_dict[result['course']]})")
                            st.write("---")
                            st.write(f"Semantic Score: {result['semantic_score']:.2f}")
                            st.write(f"Title Score: {result['title_score']:.2f}")
                            st.write(f"Description Score: {result['desc_score']:.2f}")
                            st.write(f"Topic Score: {result['topic_score']:.2f}")
                else:
                    st.warning("No results found. Try a different search query.")
            except Exception as e:
                st.error(f"An error occurred: {str(e)}")