Spaces:
Sleeping
Sleeping
| import os | |
def load_documents(dataset_path):
    """Load the text of every readable file under each category directory.

    ``dataset_path`` is expected to contain one sub-directory per category;
    each regular file inside a category directory becomes one document whose
    label is the name of that directory. Entries of ``dataset_path`` that are
    not directories are ignored.

    Args:
        dataset_path: Path of the directory holding the category
            sub-directories.

    Returns:
        A ``(documents, labels)`` tuple of two parallel lists: the file
        contents as strings and the matching category names. Categories and
        the files within them are visited in sorted name order, so the
        result is reproducible across runs and platforms.

    Raises:
        OSError: If ``dataset_path`` or one of its category directories
            cannot be listed (for example ``FileNotFoundError`` when
            ``dataset_path`` does not exist).
    """
    documents = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for category in sorted(os.listdir(dataset_path)):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            continue
        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            try:
                # latin-1 maps every byte to a code point, so decoding itself
                # cannot fail; only the OS-level open/read can.
                with open(file_path, "r", encoding="latin-1") as f:
                    text = f.read()
            except OSError:
                # Best-effort load: skip entries that cannot be read as files
                # (nested directories, permission problems, broken links)
                # without hiding KeyboardInterrupt or programming errors.
                continue
            documents.append(text)
            labels.append(category)
    return documents, labels
# Script entry point: load the dataset from its default location and print a
# short summary of what was found.
if __name__ == "__main__":
    dataset_path = "data/20_newsgroups"
    docs, labels = load_documents(dataset_path)
    print("Total documents:", len(docs))
    # Guard the sample lookup: indexing an empty list would raise IndexError
    # when the dataset directory contains no readable documents.
    if labels:
        print("Example category:", labels[0])
    else:
        print("No documents found in", dataset_path)