Spaces:
Sleeping
Sleeping
| # 2. dataset_utils.py | |
| # Dataset loading and preprocessing | |
| from datasets import load_dataset | |
| from transformers import ViTImageProcessor | |
| from PIL import Image | |
| import os | |
class DatasetHandler:
    """Load a Hugging Face dataset and pull species descriptions out of it."""

    def __init__(self, dataset_name="Gharaee/BIOSCAN-5M"):
        """Remember which dataset to load.

        Args:
            dataset_name: Identifier handed unchanged to ``load_dataset``.
        """
        self.dataset_name = dataset_name

    @staticmethod
    def _collect_descriptions(records):
        """Build a ``species_name -> description`` mapping from *records*.

        Args:
            records: Iterable of mapping-like objects exposing ``.get``.

        Returns:
            A dict keyed by each record's ``"species_name"`` (or
            ``"Unknown Species"`` when the key is absent) whose value is the
            record's ``"description"`` (or ``"No description available."``).
            When several records share a key, the last one wins.
        """
        return {
            record.get("species_name", "Unknown Species"): record.get(
                "description", "No description available."
            )
            for record in records
        }

    def load_descriptions(self, max_records=500):
        """Return descriptions for the first ``max_records`` training records.

        Args:
            max_records: Upper bound on how many records of the ``"train"``
                split are read. Values larger than the split are reduced to
                its length; negative values are treated as zero.

        Returns:
            A dict mapping species name to description, as produced by
            ``_collect_descriptions``.
        """
        train_split = load_dataset(self.dataset_name)["train"]
        # Clamp the request so the generated indices never run past the end
        # of the split (or below zero) when the dataset is smaller than asked.
        limit = max(0, min(max_records, len(train_split)))
        return self._collect_descriptions(train_split.select(range(limit)))
# Image processors already loaded in this process, keyed by checkpoint name,
# so ``from_pretrained`` runs at most once per checkpoint instead of per call.
_PROCESSOR_CACHE = {}


def preprocess_image(image_path, model_name="google/vit-base-patch16-224"):
    """Open an image file and run it through a ViT image processor.

    Args:
        image_path: Location of the image, passed to ``Image.open``.
        model_name: Checkpoint name given to
            ``ViTImageProcessor.from_pretrained``. Defaults to the checkpoint
            this function has always used.

    Returns:
        Whatever the processor returns when called with the RGB image and
        ``return_tensors="pt"``.
    """
    processor = _PROCESSOR_CACHE.get(model_name)
    if processor is None:
        processor = ViTImageProcessor.from_pretrained(model_name)
        _PROCESSOR_CACHE[model_name] = processor

    # ``convert`` yields a new, fully loaded image, so the underlying file can
    # be released as soon as the conversion is done.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    return processor(rgb_image, return_tensors="pt")