MUTED64 committed
Commit cefcefa
1 Parent(s): 0668dff

change scorer

__init__.py ADDED
File without changes
api.py ADDED
@@ -0,0 +1,5 @@
+ from waifu_scorer.ui import launch, parse_args
+
+ if __name__ == '__main__':
+     args = parse_args()
+     launch(args)
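
Note: api.py is a thin CLI entry point; parse_args() builds the argument namespace and launch() starts the Gradio demo. A minimal sketch of driving the same entry point programmatically, bypassing the CLI parser (the attribute set below is an assumption mirroring the flags defined in waifu_scorer/ui.py, and the checkpoint path is a placeholder):

from types import SimpleNamespace
from waifu_scorer.ui import launch

# Assumed attributes, mirroring --model_path / --model_type / --fix_model_path / --device / --share.
args = SimpleNamespace(
    model_path='./1024_MLP_best-MSE4.1636_ep75.pth',
    model_type='mlp',
    fix_model_path=False,
    device='cpu',
    share=False,
)
launch(args)  # builds the gr.Blocks UI and calls demo.launch(share=args.share)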
app.py CHANGED
@@ -1,36 +1,88 @@
  import gradio as gr
  import torch
  from PIL import Image
- from torchvision.transforms import functional as F
  from typing import List
- from transformers import CLIPModel, CLIPProcessor
+ from waifu_scorer.mlp import MLP
+ import clip

  # Load the pre-trained model
- model_path = "1024_MLP_best-MSE4.1636_ep75.pth"
- model = torch.load(model_path)
- model.eval()
-
- # Load the CLIP model and processor
- clip_model = CLIPModel.from_pretrained("ViT-L/14")
- clip_processor = CLIPProcessor.from_pretrained("ViT-L/14")
-
- # Define the prediction function
- def predict(images: List[Image.Image]) -> float:
-     image_tensors = [F.to_tensor(img) for img in images]
-     inputs = clip_processor(images=image_tensors, return_tensors="pt", padding=True)
-     with torch.no_grad():
-         outputs = model(inputs.pixel_values)
-         scores = outputs.clamp(0, 10).cpu().numpy().reshape(-1).tolist()
+ model_path = "./1024_MLP_best-MSE4.1636_ep75.pth"
+ device = "cpu"
+ dtype = torch.float32
+ s = torch.load(model_path, map_location=device)
+ model = MLP(input_size=768)
+ model.load_state_dict(s)
+ model.to(device=device, dtype=dtype)
+
+ model2, preprocess = clip.load("ViT-L/14", device=device)
+
+ def normalized(a: torch.Tensor, order=2, dim=-1):
+     l2 = a.norm(order, dim, keepdim=True)
+     l2[l2 == 0] = 1
+     return a / l2
+
+ @torch.no_grad()
+ def encode_images(images: List[Image.Image], model2, preprocess, device='cpu') -> torch.Tensor:
+     if not isinstance(images, list):
+         images = [images]
+     image_tensors = [preprocess(img).unsqueeze(0) for img in images]
+     image_batch = torch.cat(image_tensors).to(device)
+     image_features = model2.encode_image(image_batch)
+     im_emb_arr = normalized(image_features).cpu().float()
+     return im_emb_arr
+
+ @torch.no_grad()
+ def predict(inputs: List[Image.Image]) -> float:
+     images = encode_images(inputs, model2, preprocess, device=device).to(device=device, dtype=dtype)
+     predictions = model(images)
+     scores = predictions.clamp(0, 10).cpu().numpy().reshape(-1).tolist()
      return scores

- # Define the Gradio interface
- iface = gr.Interface(
-     fn=predict,
-     inputs="image",
-     outputs="number",
-     title="Kemono Aesthetic Scorer",
-     description="Predict the score of a kemono based on aesthetic features.",
+
+ from waifu_scorer.predict import WaifuScorer, load_model
+ scorer = WaifuScorer(
+     model_path=model_path,
+     model_type="mlp",
+     device=device,
  )

- # Run the Gradio interface
- iface.launch()
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             image = gr.Image(
+                 label='Image',
+                 type='pil',
+                 height=512,
+                 sources=['upload', 'clipboard'],
+             )
+         with gr.Column():
+             with gr.Row():
+                 model_path = gr.Textbox(
+                     label='Model Path',
+                     value=model_path,
+                     placeholder='Path or URL to the model file',
+                     # interactive=not fix_model_path,
+                 )
+             with gr.Row():
+                 score = gr.Number(
+                     label='Score',
+                 )
+
+     def change_model(model_path):
+         scorer.mlp = load_model(model_path, model_type="mlp", device=device)
+         print(f"Model changed to `{model_path}`")
+         return gr.update()
+
+     model_path.submit(
+         fn=change_model,
+         inputs=model_path,
+         outputs=model_path,
+     )
+
+     image.change(
+         fn=lambda image: predict([image]*2)[0] if image is not None else None,
+         inputs=image,
+         outputs=score,
+     )
+
+ demo.launch()
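
Note: the new app.py swaps the transformers-based gr.Interface for an OpenAI CLIP ViT-L/14 encoder whose 768-dim embedding feeds the MLP head, wired into a gr.Blocks demo. A minimal sketch of calling the module-level predict() outside Gradio (the image path is a placeholder assumption; the image is duplicated only to mirror the demo's own image.change callback):

from PIL import Image

img = Image.open('./example.jpg').convert('RGB')  # hypothetical local test image
score = predict([img] * 2)[0]                     # CLIP embedding -> MLP score, clamped to [0, 10]
print(f'score: {score:.2f}')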
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch
  Pillow
  torchvision
  typing
- transformers
+ pytorch_lightning
+ clip
setup.py ADDED
@@ -0,0 +1,28 @@
+ from setuptools import setup, find_packages
+ with open('./requirements.txt') as f:
+     requirements = f.read().splitlines()
+
+ for i, req in enumerate(requirements):
+     if req.startswith('git+'):
+         package_name = req.split('/')[-1].split('.')[0]  # Extract package name from URL
+         requirements[i] = f"{package_name} @ {req}"
+
+ setup(
+     name='waifu-scorer',
+     version='0.1',
+     packages=find_packages(),
+     include_package_data=True,
+     description='Image caption tools',
+     long_description='',
+     author='euge',
+     author_email='1507064225@qq.com',
+     url='https://github.com/Eugeoter/waifu-scorer',
+     install_requires=requirements,
+     classifiers=[
+         'Development Status :: 3 - Alpha',
+         'Intended Audience :: Developers',
+         'License :: OSI Approved :: MIT License',
+         'Programming Language :: Python :: 3',
+         'Programming Language :: Python :: 3.7',
+     ],
+ )
waifu_scorer/__init__.py ADDED
@@ -0,0 +1 @@
+ from .predict import WaifuScorer
waifu_scorer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (233 Bytes).
waifu_scorer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (240 Bytes).
waifu_scorer/__pycache__/mlp.cpython-312.pyc ADDED
Binary file (5.52 kB).
waifu_scorer/__pycache__/predict.cpython-310.pyc ADDED
Binary file (3.04 kB).
waifu_scorer/__pycache__/predict.cpython-312.pyc ADDED
Binary file (4.98 kB).
waifu_scorer/__pycache__/train.cpython-312.pyc ADDED
Binary file (14.2 kB).
waifu_scorer/__pycache__/train_utils.cpython-312.pyc ADDED
Binary file (14.3 kB).
waifu_scorer/__pycache__/ui.cpython-312.pyc ADDED
Binary file (3.89 kB).
waifu_scorer/__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.75 kB).
waifu_scorer/mlp.py ADDED
@@ -0,0 +1,127 @@
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import pytorch_lightning as pl
+
+
+ class MLP(pl.LightningModule):
+     def __init__(self, input_size, xcol='emb', ycol='avg_rating', batch_norm=True):
+         super().__init__()
+         self.input_size = input_size
+         self.xcol = xcol
+         self.ycol = ycol
+         # self.layers = nn.Sequential(
+         #     nn.Linear(self.input_size, 2048),
+         #     nn.ReLU(),
+         #     nn.BatchNorm1d(2048),
+         #     nn.Dropout(0.4),
+
+         #     nn.Linear(2048, 512),
+         #     nn.ReLU(),
+         #     nn.BatchNorm1d(512),
+         #     nn.Dropout(0.3),
+
+         #     nn.Linear(512, 256),
+         #     nn.ReLU(),
+         #     nn.BatchNorm1d(256),
+         #     nn.Dropout(0.2),
+
+         #     nn.Linear(256, 128),
+         #     nn.ReLU(),
+         #     nn.BatchNorm1d(128),
+         #     nn.Dropout(0.1),
+
+         #     nn.Linear(128, 32),
+         #     nn.ReLU(),
+         #     nn.Linear(32, 1)
+         # )
+         self.layers = nn.Sequential(
+             nn.Linear(self.input_size, 1024),
+             # nn.ReLU(),
+             nn.Dropout(0.2),
+             nn.Linear(1024, 128),
+             # nn.ReLU(),
+             nn.Dropout(0.2),
+             nn.Linear(128, 64),
+             # nn.ReLU(),
+             nn.Dropout(0.1),
+
+             nn.Linear(64, 16),
+             # nn.ReLU(),
+
+             nn.Linear(16, 1)
+         )
+
+     def forward(self, x):
+         return self.layers(x)
+
+     def training_step(self, batch, batch_idx):
+         x = batch[self.xcol]
+         y = batch[self.ycol].reshape(-1, 1)
+         x_hat = self.layers(x)
+         loss = F.mse_loss(x_hat, y)
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         x = batch[self.xcol]
+         y = batch[self.ycol].reshape(-1, 1)
+         x_hat = self.layers(x)
+         loss = F.mse_loss(x_hat, y)
+         return loss
+
+     # def configure_optimizers(self):
+     #     optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
+     #     return optimizer
+
+
+ class ResidualBlock(nn.Module):
+     def __init__(self, input_size, output_size, batch_norm=True, dropout_rate=0.0):
+         super(ResidualBlock, self).__init__()
+         self.linear = nn.Linear(input_size, output_size)
+         self.relu = nn.ReLU()
+         self.batch_norm = nn.BatchNorm1d(output_size) if batch_norm else nn.Identity()
+         self.dropout = nn.Dropout(dropout_rate)
+         self.adjust_dims = nn.Linear(input_size, output_size) if input_size != output_size else nn.Identity()
+
+     def forward(self, x):
+         identity = self.adjust_dims(x)
+         out = self.linear(x)
+         out = self.relu(out)
+         out = self.batch_norm(out)
+         out = self.dropout(out)
+         out += identity
+         out = self.relu(out)
+         return out
+
+
+ class ResMLP(pl.LightningModule):
+     def __init__(self, input_size, xcol='emb', ycol='avg_rating', batch_norm=True):
+         super().__init__()
+         self.input_size = input_size
+         self.xcol = xcol
+         self.ycol = ycol
+         self.layers = nn.Sequential(
+             ResidualBlock(input_size, 2048, batch_norm, dropout_rate=0.3),
+             ResidualBlock(2048, 512, batch_norm, dropout_rate=0.3),
+             ResidualBlock(512, 256, batch_norm, dropout_rate=0.2),
+             ResidualBlock(256, 128, batch_norm, dropout_rate=0.1),
+             nn.Linear(128, 32),
+             nn.ReLU(),
+             nn.Linear(32, 1)
+         )
+
+     def forward(self, x):
+         return self.layers(x)
+
+     def training_step(self, batch, batch_idx):
+         x = batch[self.xcol]
+         y = batch[self.ycol].reshape(-1, 1)
+         x_hat = self.layers(x)
+         loss = F.mse_loss(x_hat, y)
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         x = batch[self.xcol]
+         y = batch[self.ycol].reshape(-1, 1)
+         x_hat = self.layers(x)
+         loss = F.mse_loss(x_hat, y)
+         return loss
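
Note: a quick smoke-test sketch for the MLP head above on dummy 768-dim embeddings (batch size and values are arbitrary assumptions; the instance is untrained, so the outputs are meaningless):

import torch
from waifu_scorer.mlp import MLP

model = MLP(input_size=768)
model.eval()                   # disable the Dropout layers for deterministic inference
with torch.no_grad():
    emb = torch.randn(4, 768)  # stand-in for normalized CLIP ViT-L/14 image embeddings
    scores = model(emb)        # shape: (4, 1)
print(scores.squeeze(-1))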
waifu_scorer/predict.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ import clip
+ import os
+ from PIL import Image
+ from typing import List
+ from .utils import get_model_cls
+
+ WAIFU_FILTER_V1_MODEL_REPO = 'Eugeoter/waifu-filter-v1/waifu-filter-v1.pth'
+
+
+ def download_from_url(url):
+     from huggingface_hub import hf_hub_download
+     split = url.split("/")
+     username, repo_id, model_name = split[-3], split[-2], split[-1]
+     model_path = hf_hub_download(f"{username}/{repo_id}", model_name)
+     return model_path
+
+
+ def load_model(model_path: str = None, model_type='mlp', input_size=768, device: str = 'cuda', dtype=torch.float32):
+     model_cls = get_model_cls(model_type)
+     model = model_cls(input_size=input_size)
+     if not os.path.isfile(model_path):
+         model_path = download_from_url(model_path)
+     s = torch.load(model_path, map_location=device)
+     model.load_state_dict(s)
+     model.to(device=device, dtype=dtype)
+     return model
+
+
+ def normalized(a: torch.Tensor, order=2, dim=-1):
+     l2 = a.norm(order, dim, keepdim=True)
+     l2[l2 == 0] = 1
+     return a / l2
+
+
+ @torch.no_grad()
+ def encode_images(images: List[Image.Image], model2, preprocess, device='cuda') -> torch.Tensor:
+     if isinstance(images, Image.Image):
+         images = [images]
+     image_tensors = [preprocess(img).unsqueeze(0) for img in images]
+     image_batch = torch.cat(image_tensors).to(device)
+     image_features = model2.encode_image(image_batch)
+     im_emb_arr = normalized(image_features).cpu().float()
+     return im_emb_arr
+
+
+ class WaifuScorer:
+     def __init__(self, model_path: str = WAIFU_FILTER_V1_MODEL_REPO, model_type='mlp', device: str = None, dtype=torch.float32):
+         print(f"loading model from `{model_path}`...")
+         device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+         self.mlp = load_model(model_path, model_type=model_type, input_size=768, device=device, dtype=dtype)
+         self.mlp.eval()
+         self.model2, self.preprocess = clip.load("ViT-L/14", device=device)
+         self.device = self.mlp.device
+         self.dtype = self.mlp.dtype
+         print(f"model loaded: cls={model_type} | device={self.device} | dtype={self.dtype}")
+
+     @torch.no_grad()
+     def predict(self, images: List[Image.Image]) -> float:
+         images = encode_images(images, self.model2, self.preprocess, device=self.device).to(device=self.device, dtype=self.dtype)
+         predictions = self.mlp(images)
+         scores = predictions.clamp(0, 10).cpu().numpy().reshape(-1).tolist()
+         return scores
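
Note: a minimal end-to-end sketch of the WaifuScorer class above (the local checkpoint and image paths are placeholder assumptions; non-local paths fall back to hf_hub_download):

from PIL import Image
from waifu_scorer.predict import WaifuScorer

scorer = WaifuScorer(
    model_path='./1024_MLP_best-MSE4.1636_ep75.pth',
    model_type='mlp',
    device='cpu',
)
img = Image.open('./example.jpg').convert('RGB')  # hypothetical test image
print(scorer.predict([img])[0])                   # score in [0, 10]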
waifu_scorer/train.py ADDED
@@ -0,0 +1,307 @@
+ # os.environ['CUDA_VISIBLE_DEVICES'] = "0"  # in case you are using a multi GPU workstation, choose your GPU here
+
+ import os
+ import torch
+ import random
+ import torch.nn as nn
+ from pathlib import Path
+ from tqdm import tqdm
+ from accelerate import Accelerator
+ from typing import Literal, Callable, Optional, Union
+ from waifuset.utils import log_utils
+ from waifuset.classes import Dataset, ImageInfo
+ from . import mlp, utils, train_utils
+
+ StrPath = Union[str, Path]
+
+
+ def train(
+     dataset_source,
+     save_path,
+     resume_path: StrPath = None,
+     data_preprocessor: Optional[Callable[[ImageInfo], float]] = None,
+     rating_func_type: Union[Callable[[ImageInfo], float], Literal['direct', 'label', 'quality']] = 'quality',
+     num_train_epochs=50,
+     learning_rate=1e-3,
+     train_batch_size=256,
+     shuffle=True,
+     flip_aug=True,
+     val_batch_size=512,
+     val_every_n_epochs=1,
+     val_percentage=0.05,  # 5% of the training data will be used for validation
+     save_best_model=True,
+     clip_batch_size=1,
+     cache_to_disk: bool = False,
+     cache_path: StrPath = None,
+     mixed_precision=None,
+     max_data_loader_n_workers: int = 4,
+     persistent_workers=False,
+     mlp_model_type: Literal['default', 'large'] = 'default',
+     clip_model_name: str = "ViT-L/14",
+     input_size: int = 768,
+     batch_norm: bool = True,
+ ):
+     r"""
+     :param dataset_source: any dataset source, e.g. path to the dataset.
+     :param save_path: path to save the trained model.
+     :param resume_path: path to the model to resume from.
+     :param cache_to_disk: whether to cache the training data to disk.
+     :param cache_path: path to the cached training data. If it does not exist, it will be created from `dataset_source`; if it exists, it will be loaded from disk.
+     :param num_train_epochs: number of training epochs.
+     :param learning_rate: learning rate.
+     :param train_batch_size: training batch size.
+     :param val_batch_size: validation batch size.
+     :param val_every_n_epochs: validation frequency.
+     :param val_percentage: percentage of the training data to be used for validation.
+     :param clip_batch_size: batch size for encoding images.
+     :param mixed_precision: whether to use mixed precision training.
+     :param max_data_loader_n_workers: maximum number of workers for data loaders.
+     :param persistent_workers: whether to use persistent workers for data loaders.
+     :param input_size: input size of the model.
+     """
+     log_utils.info(f"prepare for training")
+     accelerator = Accelerator(mixed_precision=mixed_precision)
+     weight_dtype = train_utils.prepare_dtype(mixed_precision)
+     device = accelerator.device
+     max_data_loader_n_workers = min(max_data_loader_n_workers, os.cpu_count()-1)
+     if callable(rating_func_type):
+         rating_func = rating_func_type
+     else:
+         rating_func = train_utils.get_rating_func(rating_func_type)
+
+     model2, preprocess = utils.load_clip_models(name=clip_model_name, device=device)  # RN50x64
+
+     dataset = Dataset(dataset_source, verbose=True, condition=lambda img_info: img_info.image_path.is_file())
+     if data_preprocessor:
+         for img_key, img_info in dataset.items():
+             img_info = data_preprocessor(img_info)
+     keys = list(dataset.keys())
+     random.shuffle(keys)
+     dataset = Dataset({k: dataset[k] for k in keys})
+
+     num_pos = 0
+     num_neg = 0
+     num_mid = 0
+     for img_key, img_info in dataset.items():
+         rating = rating_func(img_info)
+         if rating == 10:
+             num_pos += 1
+         elif rating == 0:
+             num_neg += 1
+         else:
+             num_mid += 1
+     log_utils.info(f"num_pos: {num_pos} | num_mid: {num_mid} | num_neg: {num_neg}")
+
+     train_size = int(len(dataset) * (1 - val_percentage))
+     val_size = len(dataset) - train_size
+     train_dataset, val_dataset = Dataset(dataset.values()[:train_size]), Dataset(dataset.values()[train_size:])
+
+     log_utils.info(f"train_size: {train_size} | val_size: {val_size}")
+
+     train_dataset, train_loader = train_utils.prepare_dataloader(
+         train_dataset,
+         batch_size=train_batch_size,
+         clip_batch_size=clip_batch_size,
+         model2=model2,
+         preprocess=preprocess,
+         input_size=input_size,
+         rating_func=rating_func,
+         shuffle=shuffle,
+         flip_aug=flip_aug,
+         cache_to_disk=cache_to_disk,
+         cache_path=cache_path,
+         max_data_loader_n_workers=max_data_loader_n_workers,
+         persistent_workers=persistent_workers,
+         device=device,
+     )
+
+     val_dataset, val_loader = train_utils.prepare_dataloader(
+         val_dataset,
+         batch_size=val_batch_size,
+         clip_batch_size=clip_batch_size,
+         model2=model2,
+         preprocess=preprocess,
+         rating_func=rating_func,
+         shuffle=shuffle,
+         flip_aug=flip_aug,
+         cache_to_disk=cache_to_disk,
+         cache_path=cache_path,
+         max_data_loader_n_workers=max_data_loader_n_workers,
+         persistent_workers=persistent_workers,
+         device=device,
+     )
+
+     rating_stat = {}
+     for i in range(len(train_dataset)):
+         # to list
+         ratings: torch.Tensor = train_dataset[i]['ratings']
+         ratings = ratings.squeeze().tolist()
+         for rating in ratings:
+             if rating not in rating_stat:
+                 rating_stat[rating] = 0
+             rating_stat[rating] += 1
+
+     log_utils.info("rating_stat:\n", '\n'.join(f'{k}: {v}' for k, v in rating_stat.items()))
+
+     # prepare model
+
+     model: mlp.MLP = utils.load_model(resume_path, model_type=mlp_model_type, input_size=input_size, batch_norm=batch_norm, device=device, dtype=weight_dtype)
+
+     # import prodigyopt
+     # print(f"use Prodigy optimizer | {optimizer_kwargs}")
+     # optimizer_class = prodigyopt.Prodigy
+     # optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+     lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, T_mult=2)
+
+     # choose the loss you want to optimize for
+     criterion = nn.MSELoss(reduction='mean')
+     criterion2 = nn.L1Loss(reduction='mean')
+
+     model, optimizer, train_loader, val_loader = accelerator.prepare(
+         model, optimizer, train_loader, val_loader
+     )
+
+     log_utils.info(f"device: {accelerator.device}")
+
+     # training loop
+     best_loss = 999  # best validation loss
+     total_train_steps = len(train_loader) * num_train_epochs
+     progress_bar = tqdm(range(total_train_steps), position=0, leave=True)
+     print(f"total_train_steps: {total_train_steps}")
+
+     class LossRecorder:
+         def __init__(self):
+             self.loss_list = []
+             self.loss_total: float = 0.0
+
+         def add(self, *, epoch: int, step: int, loss: float) -> None:
+             if epoch == 0:
+                 self.loss_list.append(loss)
+             else:
+                 self.loss_total -= self.loss_list[step]
+                 self.loss_list[step] = loss
+             self.loss_total += loss
+
+         @property
+         def moving_average(self) -> float:
+             return self.loss_total / len(self.loss_list)
+
+     loss_recorder = LossRecorder()
+     model.requires_grad_(True)
+     save_on_end = False
+
+     try:
+         for epoch in range(num_train_epochs):
+             model.train()
+             losses = []
+             losses2 = []
+             for step, input_data in enumerate(train_loader):
+                 optimizer.zero_grad(set_to_none=True)
+                 im_emb_arr: torch.Tensor = input_data['im_emb_arrs'].to(accelerator.device).to(dtype=weight_dtype)  # shape: (batch_size, input_size)
+                 rating: torch.Tensor = input_data['ratings'].to(accelerator.device).to(dtype=weight_dtype)  # shape: (batch_size, 1)
+
+                 # randomize the rating
+                 # rating_std = 0.5
+                 # rating = rating + torch.randn_like(rating) * rating_std
+
+                 # log_utils.debug(f"x.dtype: {x.dtype} | y.dtype: {y.dtype} | model.dtype: {model.dtype}")
+
+                 with accelerator.autocast():
+                     output = model(im_emb_arr)
+
+                 loss = criterion(output, rating)
+
+                 accelerator.backward(loss)
+
+                 losses.append(loss.detach().item())
+
+                 optimizer.step()
+
+                 # if step % 1000 == 0:
+                 #     print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, step, loss.item()))
+                 #     # print(y)
+
+                 progress_bar.update(1)
+
+                 current_loss = loss.detach().item()
+                 loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
+                 avr_loss: float = loss_recorder.moving_average
+                 pbar_logs = {
+                     'lr': f"{lr_scheduler.get_last_lr()[0]:.3e}",
+                     'epoch': epoch,
+                     'loss': avr_loss,
+                 }
+                 progress_bar.set_postfix(pbar_logs)
+
+             progress_bar.write('epoch %d | avg loss %6.6f' % (epoch, avr_loss))
+
+             # validation
+             if accelerator.is_main_process and epoch > 0 and epoch % val_every_n_epochs == 0:
+                 model.eval()
+                 with torch.no_grad():
+                     losses = []
+                     losses2 = []
+                     for step, input_data in enumerate(val_loader):
+                         # optimizer.zero_grad(set_to_none=True)
+                         im_emb_arr = input_data['im_emb_arrs'].to(accelerator.device).to(dtype=weight_dtype)
+                         rating = input_data['ratings'].to(accelerator.device).to(dtype=weight_dtype)
+
+                         with accelerator.autocast():
+                             output = model(im_emb_arr)
+                         loss = criterion(output, rating)
+                         lossMAE = criterion2(output, rating)
+                         # loss.backward()
+                         losses.append(loss.detach().item())
+                         losses2.append(lossMAE.detach().item())
+                         # optimizer.step()
+
+                         # if step % 1000 == 0:
+                         #     print('\tValidation - Epoch %d | Batch %d | MSE Loss %6.2f' % (epoch, step, loss.item()))
+                         #     print('\tValidation - Epoch %d | Batch %d | MAE Loss %6.2f' % (epoch, step, lossMAE.item()))
+
+                     # print(y)
+                     current_loss = sum(losses)/len(losses)
+                     s = [f"validation - epoch {log_utils.stylize(epoch, log_utils.ANSI.YELLOW)}"]
+                     s.append(f"avg MSE loss {log_utils.stylize(current_loss, log_utils.ANSI.GREEN, format_spec='.4f')}")
+                     s.append(f"avg MAE loss {log_utils.stylize(sum(losses2)/len(losses2), log_utils.ANSI.YELLOW, format_spec='.4f')}")
+                     progress_bar.write(' | '.join(s))
+                     # progress_bar.write('validation - epoch %d | avg MSE loss %6.4f' % (epoch, sum(losses)/len(losses)))
+                     # progress_bar.write('validation - epoch %d | avg MAE loss %6.4f' % (epoch, sum(losses2)/len(losses2)))
+
+                     if save_best_model and current_loss < best_loss:
+                         best_loss = current_loss
+                         progress_bar.write(f"best MSE val loss ({log_utils.stylize(best_loss, log_utils.ANSI.BOLD, log_utils.ANSI.GREEN)}) so far. saving model...")
+                         best_save_path = Path(save_path).parent / f"{Path(save_path).stem}_best-MSE{best_loss:.4f}{Path(save_path).suffix}"
+                         train_utils.save_model(model, best_save_path, epoch=epoch)
+                         progress_bar.write(f"model saved: `{save_path}`")
+
+             lr_scheduler.step()
+             accelerator.wait_for_everyone()
+     except KeyboardInterrupt:
+         log_utils.warn("KeyboardInterrupt")
+         if input(f"save model to {save_path}? [y/n]") == 'y':
+             save_on_end = True
+     else:
+         save_on_end = True
+
+     progress_bar.close()
+     model = accelerator.unwrap_model(model)
+     accelerator.wait_for_everyone()
+
+     if accelerator.is_main_process and save_on_end:
+         log_utils.info("saving model...")
+         train_utils.save_model(model, save_path)
+         log_utils.info(f"model saved: `{save_path}`")
+
+     del accelerator
+
+     log_utils.success(f"training done. best loss: {best_loss}")
+
+     # inference test with dummy samples from the val set, sanity check
+     # log_utils.info("inference test with dummy samples from the val set, sanity check")
+     # model.eval()
+     # output = model(x[:5].to(device))
+     # log_utils.info(output.size())
+     # log_utils.info(output)
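
Note: a hypothetical invocation sketch for train() with mostly default hyperparameters (the dataset source, cache path, and save path are placeholder assumptions; the source must be something waifuset.classes.Dataset can load):

from waifu_scorer.train import train

train(
    dataset_source='./data/images',        # assumed waifuset-compatible dataset source
    save_path='./models/waifu_scorer.pth',
    cache_to_disk=True,
    cache_path='./cache/clip_embs.h5',     # CLIP embeddings cached as HDF5
    num_train_epochs=50,
    train_batch_size=256,
)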
waifu_scorer/train_utils.py ADDED
@@ -0,0 +1,333 @@
+ import os
+ import torch
+ import h5py
+ import math
+ import random
+ from torch.utils.data import DataLoader
+ from pathlib import Path
+ from typing import List, Callable, Tuple
+ from tqdm import tqdm
+ from PIL import Image
+ from waifuset.classes import Dataset, ImageInfo
+ from waifuset.utils import log_utils
+ from .utils import encode_images, load_clip_models, quality_rating
+
+
+ class LaionImageInfo:
+     def __init__(
+         self,
+         img_path=None,
+         im_emb_arr=None,
+         rating=None,
+         im_emb_arr_flipped=None,
+         num_repeats=1,
+     ):
+         self.img_path = img_path
+         self.im_emb_arr = im_emb_arr
+         self.rating = rating
+         self.im_emb_arr_flipped = im_emb_arr_flipped
+         self.num_repeats = num_repeats
+
+
+ class LaionDataset:
+     def __init__(
+         self,
+         source,
+         cache_to_disk=True,
+         cache_path=None,
+         batch_size=1,
+         clip_batch_size=4,
+         model2=None,
+         preprocess=None,
+         input_size=768,
+         rating_func: Callable = quality_rating,
+         repeating_func: Callable = None,
+         shuffle=True,
+         flip_aug: bool = True,
+         device='cuda'
+     ):
+         if model2 is None or preprocess is None:
+             model2, preprocess = load_clip_models(device)  # RN50x64
+         if cache_to_disk and cache_path is None:
+             raise ValueError("cache_path must be specified when cache_to_disk is True.")
+         self.source = source
+         self.cache_to_disk = cache_to_disk
+         self.cache_path = Path(cache_path)
+         self.model2, self.preprocess = model2, preprocess
+         self.input_size = input_size
+         self.rating_func = rating_func
+         self.batch_size = batch_size
+         self.encoder_batch_size = clip_batch_size
+         self.shuffle = shuffle
+         self.flip_aug = flip_aug
+         self.device = device
+
+         dataset: Dataset = Dataset(source, verbose=True)
+
+         self.image_data = []
+
+         for img_key, img_info in tqdm(dataset.items(), desc='prepare dataset'):
+             img_path = img_info.image_path
+             rating = self.rating_func(img_info)
+             laion_image_info = LaionImageInfo(
+                 img_path=img_path,
+                 rating=rating,
+             )
+             self.register_image_info(laion_image_info)
+
+         rating_counter = {}
+         for laion_img_info in tqdm(self.image_data, desc='calculating num repeats (1/2)'):
+             # to list
+             rating: torch.Tensor = laion_img_info.rating
+             rating_counter.setdefault(rating, 0)
+             rating_counter[rating] += 1
+
+         for laion_img_info in tqdm(self.image_data, desc='calculating num repeats (2/2)'):
+             benchmark = 30000
+             num_repeats = benchmark / rating_counter[laion_img_info.rating]
+             prob = num_repeats - math.floor(num_repeats)
+             num_repeats = math.floor(num_repeats) if random.random() < prob else math.ceil(num_repeats)
+             laion_img_info.num_repeats = max(1, num_repeats)
+
+         self.cache_embs()
+         self.batches = self.make_batches()
+
+     def register_image_info(self, image_info: LaionImageInfo):
+         self.image_data.append(image_info)
+
+     def cache_embs(self):
+         self.cache_path.parent.mkdir(parents=True, exist_ok=True)
+
+         not_cached = []  # list of (image_info, flipped)
+         num_cached = 0
+
+         # load cache
+         if self.cache_to_disk:
+             pbar = tqdm(total=len(self.image_data), desc='loading cache')
+
+             def load_cached_emb(h5, image_info: LaionImageInfo, flipped=False):
+                 nonlocal num_cached
+                 image_key = image_info.img_path.stem
+                 if flipped:
+                     image_key = image_key + '_flipped'
+                 if image_key in h5:
+                     im_emb_arr = torch.from_numpy(f[image_key][:])
+                     if im_emb_arr.shape[-1] != self.input_size:
+                         raise ValueError(f"Input size mismatched. Except {self.input_size} dim, but got {im_emb_arr.shape[-1]} dim loaded. Please check your cache file.")
+                     assert im_emb_arr.device == torch.device('cpu'), "flipped image emb should be on cpu"
+                     if flipped:
+                         image_info.im_emb_arr_flipped = im_emb_arr
+                     else:
+                         image_info.im_emb_arr = im_emb_arr
+                     num_cached += 1
+                 else:
+                     not_cached.append((image_info, flipped))
+
+             if not is_h5_file(self.cache_path):
+                 # create cache
+                 log_utils.info(f"cache file not found, creating new cache file: {self.cache_path}")
+                 with h5py.File(self.cache_path, 'w') as f:
+                     pass
+             else:
+                 log_utils.info(f"loading cache file: {self.cache_path}")
+                 with h5py.File(self.cache_path, 'r') as f:
+                     for image_info in self.image_data:
+                         load_cached_emb(f, image_info, flipped=False)
+                         if self.flip_aug:
+                             load_cached_emb(f, image_info, flipped=True)
+                         pbar.update()
+             pbar.close()
+         else:
+             not_cached = [(image_info, False) for image_info in self.image_data]
+             if self.flip_aug:
+                 not_cached += [(image_info, True) for image_info in self.image_data]
+
+         # encode not-cached images
+         if len(not_cached) == 0:
+             log_utils.info("all images are cached.")
+         else:
+             log_utils.info(f"number of cached instances: {num_cached}")
+             log_utils.info(f"number of not cached instances: {len(not_cached)}")
+
+             batches = [not_cached[i:i + self.encoder_batch_size] for i in range(0, len(not_cached), self.encoder_batch_size)]
+             pbar = tqdm(total=len(batches), desc='encoding images')
+
+             def cache_batch_embs(h5, batch: List[Tuple[LaionImageInfo, bool]]):
+                 try:
+                     images = [Image.open(image_info.img_path) if not flipped else Image.open(image_info.img_path).transpose(Image.FLIP_LEFT_RIGHT) for image_info, flipped in batch]
+                 except:
+                     log_utils.error(f"Error occurred when loading one of the images: {[image_info.img_path for image_info, flipped in batch]}")
+                     raise
+                 im_emb_arrs = encode_images(images, self.model2, self.preprocess, device=self.device)  # shape: [batch_size, input_size]
+                 for i, item in enumerate(batch):
+                     image_info, flipped = item
+                     im_emb_arr = im_emb_arrs[i]
+                     shape_size = len(im_emb_arr.shape)
+                     if shape_size == 1:
+                         im_emb_arr = im_emb_arr.unsqueeze(0)
+                     elif shape_size == 3:
+                         im_emb_arr = im_emb_arr.squeeze(1)
+
+                     image_key = image_info.img_path.stem
+                     assert im_emb_arr.device == torch.device('cpu'), "flipped image emb should be on cpu"
+                     if flipped:
+                         image_key = image_key + '_flipped'
+                         image_info.im_emb_arr_flipped = im_emb_arr
+                     else:
+                         image_info.im_emb_arr = im_emb_arr
+
+                     if self.cache_to_disk:
+                         if image_key in h5:
+                             continue
+                         h5.create_dataset(image_key, data=im_emb_arr.cpu().numpy())
+
+             try:
+                 h5 = h5py.File(self.cache_path, 'a') if self.cache_to_disk else None
+                 for batch in batches:
+                     cache_batch_embs(h5, batch)
+                     pbar.update()
+             finally:
+                 if h5:
+                     h5.close()
+                 pbar.close()
+
+     def make_batches(self):
+         batches = []
+         repeated_image_data = []
+         for image_info in self.image_data:
+             repeated_image_data += [image_info] * image_info.num_repeats
+         log_utils.info(f"number of instances (repeated): {len(repeated_image_data)}")
+         for i in range(0, len(repeated_image_data), self.batch_size):
+             batch = repeated_image_data[i:i + self.batch_size]
+             batches.append(batch)
+         if self.shuffle:
+             random.shuffle(batches)
+         return batches
+
+     def __getitem__(self, index):
+         batch = self.batches[index]
+         im_emb_arrs = []
+         ratings = []
+         for image_info in batch:
+             flip = self.flip_aug and random.random() > 0.5
+             if not flip:
+                 im_emb_arr = image_info.im_emb_arr
+             else:
+                 im_emb_arr = image_info.im_emb_arr_flipped
+             rating = image_info.rating
+
+             im_emb_arrs.append(im_emb_arr)
+             ratings.append(rating)
+
+         im_emb_arrs = torch.cat(im_emb_arrs, dim=0)
+         ratings = torch.tensor(ratings).unsqueeze(-1)
+         sample = dict(
+             im_emb_arrs=im_emb_arrs,
+             ratings=ratings,
+         )
+         return sample
+
+     def __len__(self):
+         return len(self.batches)
+
+
+ def collate_fn(batch):
+     return batch[0]
+
+
+ def get_rating_func(rating_func_type: str):
+     if rating_func_type == 'quality':
+         from .utils import quality_rating
+         rating_func = quality_rating
+     else:
+         raise ValueError(f"Invalid rating type: {rating_func_type}")
+     return rating_func
+
+
+ def prepare_dataloader(
+     dataset_source,
+     cache_to_disk=True,
+     cache_path=None,
+     batch_size=1,
+     clip_batch_size=4,
+     model2=None,
+     preprocess=None,
+     input_size=768,
+     rating_func: Callable = quality_rating,
+     shuffle=True,
+     flip_aug: bool = True,
+     device='cuda',
+     persistent_workers=False,
+     max_data_loader_n_workers=0,
+ ):
+     dataset = LaionDataset(
+         dataset_source,
+         cache_to_disk=cache_to_disk,
+         cache_path=cache_path,
+         batch_size=batch_size,
+         clip_batch_size=clip_batch_size,
+         model2=model2,
+         preprocess=preprocess,
+         input_size=input_size,
+         rating_func=rating_func,
+         shuffle=shuffle,
+         flip_aug=flip_aug,
+         device=device,
+     )
+
+     dataloader = DataLoader(
+         dataset,
+         batch_size=1,  # fix to 1
+         shuffle=shuffle,
+         num_workers=max_data_loader_n_workers,
+         persistent_workers=persistent_workers,
+         collate_fn=collate_fn,
+     )
+
+     return dataset, dataloader
+
+
+ def is_h5_file(cache_path):
+     if not cache_path or not h5py.is_hdf5(cache_path):
+         return False
+     return True
+
+
+ # def make_train_data(
+ #     dataset_source,
+ #     rating_func: Callable = quality_rating,
+ #     batch_size=1,
+ #     flip_aug: bool = True,
+ #     device='cuda'
+ # ):
+ #     model2, preprocess = clip.load("ViT-L/14", device=device)  # RN50x64
+ #     dataset = Dataset.from_source(dataset_source, verbose=True)
+ #     x_train = []
+ #     y_train = []
+ #     batches = [dataset[i:i + batch_size] for i in range(0, len(dataset), batch_size)]
+ #     for batch in tqdm(batches, desc='encoding images', smoothing=1):
+ #         im_emb_arr = encode_images([d.pil_img for d in batch], model2, preprocess, device=device)  # shape: [batch_size, 768]
+ #         ratings = torch.tensor([rating_func(data) for data in batch]).unsqueeze(-1).to(device)  # shape: [batch_size, 1]
+ #         x_train.append(im_emb_arr)
+ #         y_train.append(ratings)
+ #     x_train = torch.cat(x_train, dim=0)
+ #     y_train = torch.cat(y_train, dim=0)
+ #     return x_train, y_train
+
+
+ def prepare_dtype(mixed_precision: str):
+     weight_dtype = torch.float32
+     if mixed_precision == "fp16":
+         weight_dtype = torch.float16
+     elif mixed_precision == "bf16":
+         weight_dtype = torch.bfloat16
+     return weight_dtype
+
+
+ def save_model(model, save_path, epoch=None):
+     save_path = str(save_path)
+     os.makedirs(os.path.dirname(save_path), exist_ok=True)
+     if epoch is not None:
+         save_path = save_path.replace('.pth', f'_ep{epoch}.pth')
+     torch.save(model.state_dict(), save_path)
+     return save_path
waifu_scorer/ui.py ADDED
@@ -0,0 +1,91 @@
+ import gradio as gr
+ from argparse import ArgumentParser
+
+
+ def parse_args():
+     parser = ArgumentParser()
+     parser.add_argument(
+         '--model_path',
+         type=str,
+         default='./model/v3.pth',
+         help='Path or url to the model file',
+     )
+     parser.add_argument(
+         '--model_type',
+         type=str,
+         default='mlp',
+         help='Type of the model',
+     )
+     parser.add_argument(
+         '--fix_model_path',
+         action='store_true',
+         help='Fix the model path',
+     )
+     parser.add_argument(
+         '--device',
+         type=str,
+         default='cuda',
+         help='Device to use',
+     )
+     parser.add_argument(
+         '--share',
+         action='store_true',
+         help='Share the demo',
+     )
+     return parser.parse_args()
+
+
+ def ui(args):
+     from waifu_scorer.predict import WaifuScorer, load_model
+     scorer = WaifuScorer(
+         model_path=args.model_path,
+         model_type=args.model_type,
+         device=args.device,
+     )
+
+     with gr.Blocks() as demo:
+         with gr.Row():
+             with gr.Column():
+                 image = gr.Image(
+                     label='Image',
+                     type='pil',
+                     height=512,
+                     sources=['upload', 'clipboard'],
+                 )
+             with gr.Column():
+                 with gr.Row():
+                     model_path = gr.Textbox(
+                         label='Model Path',
+                         value=args.model_path,
+                         placeholder='Path or URL to the model file',
+                         interactive=not args.fix_model_path,
+                     )
+                 with gr.Row():
+                     score = gr.Number(
+                         label='Score',
+                     )
+
+         def change_model(model_path):
+             nonlocal scorer
+             scorer.mlp = load_model(model_path, model_type=args.model_type, device=args.device)
+             print(f"Model changed to `{model_path}`")
+             return gr.update()
+
+         model_path.submit(
+             fn=change_model,
+             inputs=model_path,
+             outputs=model_path,
+         )
+
+         image.change(
+             fn=lambda image: scorer.predict([image]*2)[0] if image is not None else None,
+             inputs=image,
+             outputs=score,
+         )
+
+     return demo
+
+
+ def launch(args):
+     demo = ui(args)
+     demo.launch(share=args.share)
waifu_scorer/utils.py ADDED
@@ -0,0 +1,72 @@
+ import torch
+ import clip
+ from PIL import Image
+ from typing import List, Union
+ from . import mlp
+
+ QUALITY_TO_RATING = {
+     'amazing': 10,
+     'best': 8.5,
+     'high': 7,
+     'normal': 5,
+     'low': 2.5,
+     'worst': 0,
+     'horrible': 0,
+ }
+
+ MODEL_TYPE = {
+     'mlp': mlp.MLP,
+     'res_mlp': mlp.ResMLP,
+ }
+
+
+ def quality_rating(img_info):
+     quality = (img_info.caption.quality or 'normal') if img_info.caption is not None else 'normal'
+     rating = QUALITY_TO_RATING[quality]
+     return rating
+
+
+ def get_model_cls(model_type) -> Union[mlp.MLP, None]:
+     return MODEL_TYPE.get(model_type, mlp.MLP)
+
+
+ def load_clip_models(name: str = "ViT-L/14", device='cuda'):
+     model2, preprocess = clip.load(name, device=device)  # RN50x64
+     return model2, preprocess
+
+
+ def load_model(model_path: str = None, model_type=None, input_size=768, batch_norm: bool = True, device: str = 'cuda', dtype=None):
+     model_cls = get_model_cls(model_type)
+     print(f"Loading model from class `{model_cls}`...")
+     model_kwargs = {}
+     if model_type in ('large', 'res_large'):
+         model_kwargs['batch_norm'] = True
+     model = model_cls(input_size, **model_kwargs)
+     if model_path:
+         try:
+             s = torch.load(model_path, map_location=device)
+             model.load_state_dict(s)
+         except Exception as e:
+             print(f"Model type mismatch. Desired model type: `{model_type}` (model class: `{model_cls}`).")
+             raise e
+     model.to(device)
+     if dtype:
+         model = model.to(dtype=dtype)
+     return model
+
+
+ def normalized(a: torch.Tensor, order=2, dim=-1):
+     l2 = a.norm(order, dim, keepdim=True)
+     l2[l2 == 0] = 1
+     return a / l2
+
+
+ @torch.no_grad()
+ def encode_images(images: List[Image.Image], model2, preprocess, device='cuda') -> torch.Tensor:
+     if isinstance(images, Image.Image):
+         images = [images]
+     image_tensors = [preprocess(img).unsqueeze(0) for img in images]
+     image_batch = torch.cat(image_tensors).to(device)
+     image_features = model2.encode_image(image_batch)
+     im_emb_arr = normalized(image_features).cpu().float()
+     return im_emb_arr
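
Note: a small sketch wiring the helpers above together on CPU (the checkpoint and image paths are placeholder assumptions):

import torch
from PIL import Image
from waifu_scorer.utils import load_clip_models, load_model, encode_images

device = 'cpu'
clip_model, preprocess = load_clip_models("ViT-L/14", device=device)
mlp = load_model('./1024_MLP_best-MSE4.1636_ep75.pth', model_type='mlp', device=device, dtype=torch.float32)
mlp.eval()

emb = encode_images(Image.open('./example.jpg').convert('RGB'), clip_model, preprocess, device=device)
with torch.no_grad():
    print(mlp(emb).clamp(0, 10).item())  # single-image score in [0, 10]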