Spaces:
Running
Running
File size: 711 Bytes
ed26242 f861dee 6dd2090 ed26242 6dd2090 f861dee ed26242 f861dee ed26242 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
from sklearn.datasets import fetch_20newsgroups
class DocumentRetriever:
    """Keep a list of text documents in memory and search it by substring.

    Documents are stored as plain strings in ``self.documents``. The list
    starts empty and is filled by ``load_documents``.
    """

    def __init__(self):
        """Create a retriever with no documents loaded."""
        # Raw document texts; empty until load_documents() is called.
        self.documents = []

    def load_documents(self, subset_size: int = 500) -> None:
        """Load a subset of the 20 Newsgroups dataset.

        The dataset is requested with ``subset='all'`` and only then sliced,
        so the complete collection is fetched even for a small
        ``subset_size``.

        Args:
            subset_size: Upper bound of the slice taken from the start of
                the fetched data (``data[:subset_size]``).
        """
        newsgroups_data = fetch_20newsgroups(subset='all')
        # Keep only the leading `subset_size` documents.
        self.documents = newsgroups_data.data[:subset_size]
        print(f"Loaded {len(self.documents)} documents.")

    def retrieve(self, query: str) -> list:
        """Return the documents that contain ``query``, ignoring case.

        Matching is a plain substring test on the lower-cased texts, so an
        empty ``query`` matches every document.

        Args:
            query: Text to look for inside each stored document.

        Returns:
            The matching documents in their stored order. When no documents
            are loaded, a single-element list holding the message
            ``"Document retrieval is not initialized."`` is returned instead.
        """
        if not self.documents:
            return ["Document retrieval is not initialized."]

        # Lower-case the query once rather than once per document.
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.lower()]
|