import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups

class DocumentRetriever:
    """Keyword-based retriever over an in-memory list of text documents.

    Documents are held in ``self.documents`` as plain strings. The list is
    empty until ``load_documents`` is called (or the attribute is assigned
    directly), and ``retrieve`` performs a case-insensitive substring match
    against every stored document.
    """

    def __init__(self):
        # Raw document texts; stays empty until load_documents() populates it.
        self.documents = []

    def load_documents(self, subset='all'):
        """Load the 20 Newsgroups dataset into ``self.documents``.

        Args:
            subset: Passed through to ``fetch_20newsgroups`` to select which
                part of the dataset to load. Defaults to ``'all'``, which is
                what this method always used before the parameter existed.
        """
        newsgroups_data = fetch_20newsgroups(subset=subset)
        self.documents = newsgroups_data.data
        if not self.documents:
            print("No documents loaded!")

    def retrieve(self, query: str) -> list:
        """Return the stored documents that contain ``query``.

        Matching is a case-insensitive substring test; results keep the
        order of ``self.documents``.

        Args:
            query: Text to look for inside each document.

        Returns:
            A list of matching document strings. If no documents are loaded,
            a single-element list holding the message
            ``"Document retrieval is not initialized."`` is returned instead.
            An empty or whitespace-only query yields an empty list.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]
        # An empty string is a substring of every string, so a blank query
        # would otherwise return the entire corpus.
        if not query.strip():
            return []
        # Lower-case the query once instead of once per document.
        needle = query.lower()
        # Simple keyword match (can replace with advanced semantic similarity later)
        return [doc for doc in self.documents if needle in doc.lower()]