|
|
from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer |
|
|
import torch |
|
|
from torchvision import transforms |
|
|
from PIL import Image |
|
|
import os |
|
|
from tqdm import tqdm |
|
|
|
|
|
def initialize_model(): |
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
model_dict = {} |
|
|
model, preprocess_train, preprocess_val = create_model_and_transforms( |
|
|
'ViT-H-14', |
|
|
'/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/pytorch_model.bin', |
|
|
precision='amp', |
|
|
device=device, |
|
|
jit=False, |
|
|
force_quick_gelu=False, |
|
|
force_custom_text=False, |
|
|
force_patch_dropout=False, |
|
|
force_image_size=None, |
|
|
pretrained_image=False, |
|
|
image_mean=None, |
|
|
image_std=None, |
|
|
light_augmentation=True, |
|
|
aug_cfg={}, |
|
|
output_dict=True, |
|
|
with_score_predictor=False, |
|
|
with_region_predictor=False |
|
|
) |
|
|
model_dict['model'] = model |
|
|
model_dict['preprocess_val'] = preprocess_val |
|
|
return model_dict, device |
|
|
|
|
|
def load_images_from_folder(folder): |
|
|
images = [] |
|
|
filenames = [] |
|
|
for filename in os.listdir(folder): |
|
|
if filename.endswith(".png"): |
|
|
img_path = os.path.join(folder, filename) |
|
|
image = Image.open(img_path).convert("RGB") |
|
|
images.append(image) |
|
|
filenames.append(filename) |
|
|
return images, filenames |
|
|
|
|
|
def main(): |
|
|
model_dict, device = initialize_model() |
|
|
model = model_dict['model'] |
|
|
preprocess_val = model_dict['preprocess_val'] |
|
|
|
|
|
cp = "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt" |
|
|
checkpoint = torch.load(cp, map_location=device) |
|
|
model.load_state_dict(checkpoint['state_dict']) |
|
|
tokenizer = get_tokenizer('ViT-H-14') |
|
|
reward_model = model.to(device) |
|
|
reward_model.eval() |
|
|
|
|
|
img_folder = "IMAGE_SAVE_FOLDER" |
|
|
images, filenames = load_images_from_folder(img_folder) |
|
|
|
|
|
eval_rewards = [] |
|
|
with torch.no_grad(): |
|
|
for image_pil, filename in tqdm(zip(images, filenames), total=400): |
|
|
|
|
|
image = preprocess_val(image_pil).unsqueeze(0).to(device=device, non_blocking=True) |
|
|
prompt = os.path.splitext(filename)[0] |
|
|
text = tokenizer([prompt]).to(device=device, non_blocking=True) |
|
|
outputs = reward_model(image, text) |
|
|
image_features, text_features = outputs["image_features"], outputs["text_features"] |
|
|
logits_per_image = image_features @ text_features.T |
|
|
hps_score = torch.diagonal(logits_per_image).item() |
|
|
eval_rewards.append(hps_score) |
|
|
|
|
|
avg_reward = sum(eval_rewards) / len(eval_rewards) if eval_rewards else 0 |
|
|
print(f"Average HPS score: {avg_reward:.4f}") |
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |