import faiss
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer


class DocumentRetriever:
    """Case-insensitive substring retriever over the 20 Newsgroups corpus.

    Typical use is ``load_documents()`` once, followed by any number of
    ``retrieve(query)`` calls. Until documents have been loaded,
    ``retrieve`` returns a one-element list holding a status message
    instead of raising.
    """

    def __init__(self):
        # Raw document texts; stays empty until load_documents() is called.
        self.documents = []

    def load_documents(self):
        """Load the full 20 Newsgroups dataset into ``self.documents``.

        Requests ``subset='all'`` from ``fetch_20newsgroups`` and stores the
        returned ``data`` attribute. Prints a warning if the result is empty.
        """
        newsgroups_data = fetch_20newsgroups(subset='all')
        self.documents = newsgroups_data.data
        if not self.documents:
            print("No documents loaded!")

    def retrieve(self, query: str) -> list:
        """Return every loaded document that contains ``query``.

        Matching is a plain case-insensitive substring test; no ranking or
        tokenisation is applied.

        Args:
            query: Text to search for. An empty string is a substring of
                every document, so ``""`` returns all loaded documents.

        Returns:
            The matching documents in their original order, or the
            single-element list ``["Document retrieval is not initialized."]``
            when no documents are loaded.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]

        # Simple keyword match (can replace with advanced semantic similarity
        # later). The query is lowercased once, outside the loop, rather than
        # once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]