Spaces:
Running
Running
| import os | |
| from typing import Any | |
| from PIL import Image | |
| from pycocoevalcap.cider.cider import Cider | |
| from tqdm import tqdm | |
def generate_caption(model: Any, processor: Any, image: Image.Image, device) -> str:
    """
    Run the captioning model on a single image and return the decoded caption.

    Args:
        model: A Hugging-Face-style model exposing ``generate``.
        processor: The paired processor, exposing ``__call__`` (image -> tensors)
            and ``decode`` (token ids -> text).
        image: Input PIL image.
        device: Device the input tensors are moved to before generation.

    Returns:
        The decoded caption string with special tokens stripped.
    """
    # Plain local import replaces the original's obfuscated
    # getattr(__import__("torch"), "no_grad")() construction; kept function-local
    # to match the file's "avoid hard dependency" convention.
    import torch

    inputs = processor(images=image, return_tensors="pt").to(device)
    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_length=30,
            num_beams=5,
        )
    caption = processor.decode(
        generated_ids[0],
        skip_special_tokens=True,
    )
    return caption
def evaluate_cider(
    model: Any,
    processor: Any,
    val_dataset,
    device,
    max_samples: int = 200,
    image_dir: str = "train2017",
) -> float:
    """
    Compute the CIDEr score of the model on a validation subset.

    Expects a PyTorch ``Subset``-like dataset where:
      - ``val_dataset.indices[idx]`` gives the underlying index
      - ``val_dataset.dataset.annotations[...]`` is a list of dicts with
        keys ``image`` and ``captions``.

    Args:
        model: Captioning model; switched to eval mode during scoring and
            restored to train mode afterwards (even on error).
        processor: Processor paired with the model.
        val_dataset: Validation subset as described above.
        device: Device the inputs are moved to.
        max_samples: Upper bound on the number of images scored.
        image_dir: Directory containing the image files named in the
            annotations. Defaults to "train2017" to preserve the previous
            hard-coded behavior.

    Returns:
        The corpus-level CIDEr score as a float.
    """
    model.eval()
    cider_scorer = Cider()
    ground_truth = {}
    predictions = {}
    try:
        for idx in tqdm(range(min(max_samples, len(val_dataset))), desc="CIDEr Eval"):
            real_idx = val_dataset.indices[idx]
            ann = val_dataset.dataset.annotations[real_idx]
            image_path = os.path.join(image_dir, ann["image"])
            image = Image.open(image_path).convert("RGB")
            pred_caption = generate_caption(model, processor, image, device)
            # CIDEr expects {id: [refs...]} vs {id: [hypothesis]} keyed identically.
            ground_truth[idx] = ann["captions"]
            predictions[idx] = [pred_caption]
        score, _ = cider_scorer.compute_score(ground_truth, predictions)
        print(f"CIDEr Score: {score:.4f}")
    finally:
        # Restore training mode even if an image fails to load or scoring raises,
        # so a failed evaluation cannot leave the model stuck in eval mode.
        model.train()
    return score