File size: 711 Bytes
ed26242
f861dee
 
 
 
 
6dd2090
 
ed26242
6dd2090
 
f861dee
ed26242
 
 
f861dee
ed26242
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from sklearn.datasets import fetch_20newsgroups

class DocumentRetriever:
    """Keep a list of text documents in memory and search it by substring.

    The list starts empty. ``load_documents`` fills it from the
    20 Newsgroups dataset, and ``retrieve`` scans it for a query string.
    """

    def __init__(self):
        # Raw document texts; stays empty until load_documents() is called
        # (or the attribute is assigned directly).
        self.documents = []

    def load_documents(self, subset_size: int = 500) -> None:
        """Load a subset of the 20 Newsgroups dataset.

        Replaces ``self.documents`` with the first ``subset_size`` entries
        of the dataset's ``data`` list and prints how many were kept.

        Args:
            subset_size: Upper bound on the number of documents to keep.
                It is applied as a plain slice (``data[:subset_size]``),
                so fewer documents are kept if the dataset is smaller.
        """
        # subset='all' requests the whole corpus; the slice below is what
        # actually limits how much is held on this instance.
        # NOTE(review): per the scikit-learn docs the fetch may download the
        # dataset on first use -- confirm that is acceptable for callers.
        newsgroups_data = fetch_20newsgroups(subset='all')
        self.documents = newsgroups_data.data[:subset_size]
        print(f"Loaded {len(self.documents)} documents.")

    def retrieve(self, query: str) -> list:
        """Retrieve documents related to the query.

        A document matches when the lower-cased query occurs anywhere in
        the lower-cased document text (plain substring test).

        Args:
            query: Text to search for. An empty string is a substring of
                every document, so it matches all of them.

        Returns:
            The matching documents in their stored order (possibly an
            empty list). If no documents are loaded, a one-element list
            holding the message "Document retrieval is not initialized."
            is returned instead; that entry is a status message, not a
            document, and is kept for backward compatibility.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]
        # Lower-case the query once rather than once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]