yeq6x committed on
Commit
02ba63a
1 Parent(s): 471c8d4
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ # Use Python 3.9 as the base image
+ FROM python:3.9-slim
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Copy the dependency file used to install the required Python libraries
+ COPY requirements.txt /app/requirements.txt
+
+ # Install the required Python packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the application code into the container
+ COPY . /app
+
+ # Expose the port (Gradio's default port 7860)
+ EXPOSE 7860
+
+ # Start the application
+ CMD ["python", "app.py"]
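+
+ # Local usage sketch (the image tag "feature-map-demo" is arbitrary):
+ #   docker build -t feature-map-demo .
+ #   docker run -p 7860:7860 feature-map-demo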
app.py ADDED
@@ -0,0 +1,229 @@
+ import gradio as gr
+ import spaces
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ import matplotlib.pyplot as plt
+ from model_module import AutoencoderModule
+ from dataset import MyDataset, load_filenames
+ from utils import DistanceMapLogger
+ import numpy as np
+ from PIL import Image
+ import base64
+ from io import BytesIO
+
+ # Load the model and data
+ def load_model():
+     model_path = "checkpoints/ae_model_tf_2024-03-05_00-35-21.pth"
+     feature_dim = 32
+     model = AutoencoderModule(feature_dim=feature_dim)
+     state_dict = torch.load(model_path)
+
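+     # The checkpoint stores weights for the bare Autoencoder, while
+     # AutoencoderModule wraps it as `self.model`, so each key needs a
+     # "model." prefix before loading into the LightningModule.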
+     # Fix the state_dict keys
+     new_state_dict = {}
+     for key in state_dict:
+         new_key = "model." + key
+         new_state_dict[new_key] = state_dict[key]
+     model.load_state_dict(new_state_dict)
+     model.eval()
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     print("Model loaded successfully.")
+     return model, device
+
+ def load_data(device, img_dir="resources/trainB/", image_size=112, batch_size=32):
+     filenames = load_filenames(img_dir)
+     train_X = filenames[:1000]
+     train_ds = MyDataset(train_X, img_dir=img_dir, img_size=image_size)
+
+     train_loader = DataLoader(
+         train_ds,
+         batch_size=batch_size,
+         shuffle=True,
+         num_workers=0,
+     )
+
+     iterator = iter(train_loader)
+     x, _, _ = next(iterator)
+     x = x.to(device)
+     # Each dataset item holds two augmented views; keep only the first view per sample
+     x = x[:, 0].to(device)
+     print("Data loaded successfully.")
+     return x
+
+ model, device = load_model()
+ image_size = 112
+ batch_size = 32
+ x = load_data(device)
+
+ # Preprocess the uploaded image
+ def preprocess_uploaded_image(uploaded_image, image_size):
+     uploaded_image = Image.fromarray(uploaded_image)
+     uploaded_image = uploaded_image.convert("RGB")
+     uploaded_image = uploaded_image.resize((image_size, image_size))
+     uploaded_image = np.array(uploaded_image).transpose(2, 0, 1) / 255.0
+     uploaded_image = torch.tensor(uploaded_image, dtype=torch.float32).unsqueeze(0).to(device)
+     return uploaded_image
+
+ # Heatmap generation function
+ @spaces.GPU
+ def get_heatmaps(source_num, x_coords, y_coords, uploaded_image):
+     with torch.no_grad():
+         dec5, _ = model(x)
+         img = x
+         feature_map = dec5
+         batch_size = feature_map.size(0)
+         feature_dim = feature_map.size(1)
+
+         # Preprocess the uploaded image
+         if uploaded_image is not None:
+             uploaded_image = preprocess_uploaded_image(uploaded_image, image_size)
+             target_feature_map, _ = model(uploaded_image)
+             img = torch.cat((img, uploaded_image))
+             feature_map = torch.cat((feature_map, target_feature_map))
+             batch_size += 1
+         else:
+             uploaded_image = torch.zeros(1, 3, image_size, image_size, device=device)
+
+         target_num = batch_size - 1
+
+         x_coords = [x_coords] * batch_size
+         y_coords = [y_coords] * batch_size
+
+         # Feature vector at the selected (x, y) location of the source image
+         vectors = feature_map[torch.arange(feature_map.size(0)), :, y_coords, x_coords]
+         vector = vectors[source_num]
+
+         reshaped_feature_map = feature_map.permute(0, 2, 3, 1).view(feature_map.size(0), -1, feature_dim)
+         batch_distance_map = F.pairwise_distance(reshaped_feature_map, vector).view(feature_map.size(0), image_size, image_size)
+
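+         # Convert distances to a similarity heatmap: min-max normalize to [0, 1],
+         # then apply sech^2(20 * d) so pixels whose features are close to the
+         # selected vector approach 1 and everything else falls off sharply.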
+         norm_batch_distance_map = 1 / torch.cosh(20 * (batch_distance_map - batch_distance_map.min()) / (batch_distance_map.max() - batch_distance_map.min())) ** 2
+
+         source_map = norm_batch_distance_map[source_num]
+         target_map = norm_batch_distance_map[target_num]
+
+         alpha = 0.8
+         blended_source = (1 - alpha) * img[source_num] + alpha * torch.cat(((norm_batch_distance_map[source_num] / norm_batch_distance_map[source_num].max()).unsqueeze(0), torch.zeros(2, image_size, image_size, device=device)))
+         blended_target = (1 - alpha) * img[target_num] + alpha * torch.cat(((norm_batch_distance_map[target_num] / norm_batch_distance_map[target_num].max()).unsqueeze(0), torch.zeros(2, image_size, image_size, device=device)))
+
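+         # The overlays place the normalized heatmap in the red channel (green and
+         # blue are zeroed) and alpha-blend it over the corresponding input image.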
+         # Plot with Matplotlib and return the figure
+         fig, axs = plt.subplots(2, 2, figsize=(10, 10))
+         axs[0, 0].imshow(source_map.cpu(), cmap='hot')
+         axs[0, 0].set_title("Source Map")
+         axs[0, 1].imshow(target_map.cpu(), cmap='hot')
+         axs[0, 1].set_title("Target Map")
+         axs[1, 0].imshow(blended_source.permute(1, 2, 0).cpu())
+         axs[1, 0].set_title("Blended Source")
+         axs[1, 1].imshow(blended_target.permute(1, 2, 0).cpu())
+         axs[1, 1].set_title("Blended Target")
+         for ax in axs.flat:
+             ax.axis('off')
+
+         plt.tight_layout()
+         plt.close(fig)
+         return fig
+
+ def process_image(cropped_image_data):
+     # Convert the Base64 data to a PIL image
+     header, base64_data = cropped_image_data.split(',', 1)
+     image_data = base64.b64decode(base64_data)
+     image = Image.open(BytesIO(image_data))
+     return image
+
+ # JavaScript code
+ scripts = """
+ async () => {
+     const script = document.createElement("script");
+     script.src = "https://cdnjs.cloudflare.com/ajax/libs/cropperjs/1.5.13/cropper.min.js";
+     document.head.appendChild(script);
+
+     const style = document.createElement("link");
+     style.rel = "stylesheet";
+     style.href = "https://cdnjs.cloudflare.com/ajax/libs/cropperjs/1.5.13/cropper.min.css";
+     document.head.appendChild(style);
+
+     script.onload = () => {
+         let cropper;
+
+         document.getElementById("input_file_button").onclick = function() {
+             document.querySelector("#input_file").click();
+         };
+
+         // Load the image selected via the hidden file input
+         document.querySelector("#input_file").addEventListener("change", function(e) {
+             const files = e.target.files;
+             console.log(files);
+             if (files && files.length > 0) {
+                 console.log("File selected");
+                 document.querySelector("#crop_view").style.display = "block";
+                 document.querySelector("#crop_button").style.display = "block";
+                 const url = URL.createObjectURL(files[0]);
+                 const crop_view = document.getElementById("crop_view");
+                 crop_view.src = url;
+
+                 if (cropper) {
+                     cropper.destroy();
+                 }
+                 cropper = new Cropper(crop_view, {
+                     aspectRatio: 1,
+                     viewMode: 1,
+                 });
+             }
+         });
+
+         // Attach JavaScript behavior to the Gradio crop button
+         document.getElementById("crop_button").onclick = function() {
+             if (cropper) {
+                 const canvas = cropper.getCroppedCanvas();
+                 const croppedImageData = canvas.toDataURL();
+
+                 // Send the cropped image to Gradio
+                 const textbox = document.querySelector("#cropped_image_data textarea");
+                 textbox.value = croppedImageData;
+                 textbox.dispatchEvent(new Event("input", { bubbles: true }));
+
+                 document.getElementById("crop_view").style.display = "none";
+                 document.getElementById("crop_button").style.display = "none";
+
+                 cropper.destroy();
+             }
+         };
+         document.getElementById("crop_view").style.display = "none";
+         document.getElementById("crop_button").style.display = "none";
+     };
+ }
+ """
+
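+ # UI flow: the hidden <input type="file"> and Cropper.js handle selection and
+ # cropping in the browser; the crop is written as a Base64 data URL into the
+ # hidden "cropped_image_data" textbox, whose change event feeds process_image
+ # and ultimately get_heatmaps below.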
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             source_num = gr.Slider(0, batch_size - 1, step=1, label="Source Image Index")
+             x_coords = gr.Slider(0, image_size - 1, step=1, value=image_size // 2, label="X Coordinate")
+             y_coords = gr.Slider(0, image_size - 1, step=1, value=image_size // 2, label="Y Coordinate")
+
+             # Hidden file input used as the file-selection entry point
+             gr.HTML('<input type="file" id="input_file" style="display:none;">')
+             input_file_button = gr.Button("Select Image", elem_id="input_file_button")
+             # HTML image tag used by Cropper.js to display the selected image
+             gr.HTML('<img id="crop_view" style="max-width:100%;">')
+             # Gradio button component with an element ID so the JS can hook into it
+             crop_button = gr.Button("Crop", elem_id="crop_button", variant="primary")
+             # Hidden textbox holding the cropped image data (Base64)
+             cropped_image_data = gr.Textbox(visible=False, elem_id="cropped_image_data")
+             input_image = gr.Image(label="Cropped Image", interactive=False)
+             # Call process_image whenever cropped_image_data is updated
+             cropped_image_data.change(process_image, inputs=cropped_image_data, outputs=input_image)
+
+         with gr.Column():
+             output_plot = gr.Plot()
+
+     # Event wiring in place of a gr.Interface
+     source_num.change(get_heatmaps, inputs=[source_num, x_coords, y_coords, input_image], outputs=output_plot)
+     x_coords.change(get_heatmaps, inputs=[source_num, x_coords, y_coords, input_image], outputs=output_plot)
+     y_coords.change(get_heatmaps, inputs=[source_num, x_coords, y_coords, input_image], outputs=output_plot)
+     input_image.change(get_heatmaps, inputs=[source_num, x_coords, y_coords, input_image], outputs=output_plot)
+
+     # Load the JavaScript code on page load
+     demo.load(None, None, None, js=scripts)
+
+ demo.launch()
+
checkpoints/ae_model_tf_2024-03-05_00-35-21.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:77b020cb89ad2ccf7a7bf654d86fb975793cbe168bf73cd011e93cf22f63204c
+ size 2629576
checkpoints/autoencoder-epoch=09-train_loss=1.00.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af08cd5fbb1c832824b7466be4da59d2a40e6e9eef864097514a2806a24bb92b
+ size 3046514
checkpoints/autoencoder-epoch=29-train_loss=1.01.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dc5dcf07a6a66cd4f0773af5fff29903d5fb9fa340221cd59083462e1ae77b7
+ size 3046959
checkpoints/autoencoder-epoch=49-train_loss=1.01.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4f218b652ba63e9891b73efca5faffaabe4692f1b78755860ff46b113d09ecd
+ size 3046959
datamodule.py ADDED
@@ -0,0 +1,25 @@
+ import pytorch_lightning as pl
+ from torch.utils.data import DataLoader
+ from dataset import MyDataset, load_filenames  # based on dataset.py
+
+ class DataModule(pl.LightningDataModule):
+     def __init__(self, img_dir, batch_size, img_size=112, num_workers=0):
+         super().__init__()
+         self.img_dir = img_dir
+         self.batch_size = batch_size
+         self.img_size = img_size
+         self.num_workers = num_workers
+         self.file_num = 1000  # or 3400
+
+     def setup(self, stage=None):
+         filenames = load_filenames(self.img_dir)
+         self.train_dataset = MyDataset(filenames[:self.file_num], img_dir=self.img_dir, img_size=self.img_size)
+
+     def train_dataloader(self):
+         return DataLoader(
+             self.train_dataset,
+             batch_size=self.batch_size,
+             shuffle=True,
+             num_workers=self.num_workers,
+             # persistent_workers requires num_workers > 0; enable it only then
+             persistent_workers=self.num_workers > 0,
+         )
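+
+ # Minimal training sketch (not used by this Space; the values below are
+ # assumptions, not the settings used to produce the bundled checkpoints):
+ #
+ #   import pytorch_lightning as pl
+ #   from model_module import AutoencoderModule
+ #
+ #   dm = DataModule(img_dir="resources/trainB/", batch_size=16, num_workers=4)
+ #   model = AutoencoderModule(feature_dim=32)
+ #   pl.Trainer(max_epochs=50).fit(model, datamodule=dm)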
dataset.py ADDED
@@ -0,0 +1,88 @@
+ import torch
+ import torchvision
+ from torchvision import transforms
+ import random
+ from PIL import Image
+ import os
+
+ from utils import RandomAffineAndRetMat
+
+ def load_filenames(data_dir):
+     # label_data = pd.read_json(INPUT_DIR+'DataList.json')
+     # label_data = label_data.sort_index()
+     # tmp_points = []
+     # filenames = []
+
+     # for o in tqdm(label_data.data[0:1000]):
+     #     filenames.append(o['filename'])
+     #     a = o['filename']
+
+     #     tmps = []
+     #     for i in range(60):
+     #         tmps.append(o['points'][str(i)]['x'])
+     #         tmps.append(o['points'][str(i)]['y'])
+     #     tmp_points.append(tmps)  # datanum
+
+     # filenames = pd.Series(filenames)
+     # filenames = [str(i).zfill(4)+'.jpg' for i in range(3400)]
+     # df_points = pd.DataFrame(tmp_points)
+
+     # Load from data_dir, keeping image extensions only
+     img_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.ppm', '.pgm', '.tif', '.tiff']
+     filenames = [f for f in os.listdir(data_dir) if os.path.splitext(f)[1].lower() in img_exts]
+
+     return filenames
+
+
+ class MyDataset:
+     def __init__(self, X, valid=False, img_dir='resources/trainB/', img_size=256):
+         self.X = X
+         self.valid = valid
+         self.img_dir = img_dir
+         self.img_size = img_size
+
+     def __len__(self):
+         return len(self.X)
+
+     def __getitem__(self, index):
+         # Load the image and apply the transforms
+         f = self.img_dir + self.X[index]
+         original_X = Image.open(f)
+         trans = [
+             transforms.ToTensor(),
+             # transforms.Normalize(mean=means, std=stds),
+             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+
+             transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2, hue=0.15),
+             transforms.RandomGrayscale(0.3),
+         ]
+         transform = transforms.Compose(trans)
+         xlist = []
+         matlist = []
+         is_flip = random.randint(0, 1)  # the same flip is applied to both views
+ for i in range(2):
65
+ af = RandomAffineAndRetMat(
66
+ degrees=[-30, 30],
67
+ translate=(0.1, 0.1), scale=(0.8, 1.2),
68
+ # fill=(random.random(), random.random(), random.random()),
69
+ fill=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
70
+ shear=[-10, 10],
71
+ interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
72
+ )
73
+ X, affine_matrix = af(transforms.Resize(self.img_size)(original_X))
74
+
75
+ # randomflip
76
+ if is_flip == 1:
77
+ X = transforms.RandomHorizontalFlip(1.)(X)
78
+ flip_matrix = torch.tensor([[-1., 0., 0.],
79
+ [0., 1., 0.],
80
+ [0., 0., 1.]])
81
+ affine_matrix = torch.matmul(flip_matrix, affine_matrix)
82
+
83
+ xlist.append(transform(X))
84
+ matlist.append(affine_matrix)
85
+
86
+ X = torch.stack(xlist)
87
+ mat = torch.stack(matlist)
88
+ return X, mat, f
model.py ADDED
@@ -0,0 +1,62 @@
+ import torch
+ from torch import nn
+
+ class ConvBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
+         super(ConvBlock, self).__init__()
+         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
+         self.batchnorm = nn.BatchNorm2d(out_channels)
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         return self.relu(self.batchnorm(self.conv(x)))
+
+ class DeconvBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size, stride, padding, output_padding):
+         super(DeconvBlock, self).__init__()
+         self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride, padding, output_padding)
+         self.batchnorm = nn.BatchNorm2d(out_channels)
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         return self.relu(self.batchnorm(self.deconv(x)))
+
+ class Autoencoder(nn.Module):
+     def __init__(self, feature_dim=32):
+         super(Autoencoder, self).__init__()
+         self.feature_dim = feature_dim
+
+         # Encoder
+         self.enc1 = ConvBlock(3, 16, 10, 1, 0)
+         self.enc2 = ConvBlock(16, 32, 10, 1, 0)
+         self.enc3 = ConvBlock(32, 64, 2, 2, 0)
+         self.enc4 = ConvBlock(64, 128, 2, 2, 0)
+         self.enc5 = ConvBlock(128, 256, 2, 2, 0)
+
+         # Decoder (inputs are concatenated with encoder features, U-Net style)
+         self.dec1 = DeconvBlock(256, 128, 2, 2, 0, 1)
+         self.dec2 = DeconvBlock(256, 64, 2, 2, 0, 1)  # 128 + 128
+         self.dec3 = DeconvBlock(128, 32, 2, 2, 0, 0)  # 64 + 64
+         self.dec4 = DeconvBlock(64, 16, 10, 1, 0, 0)  # 32 + 32
+         self.dec5 = DeconvBlock(32, self.feature_dim, 10, 1, 0, 0)
+         self.dec6 = nn.Conv2d(self.feature_dim, 32, 1, 1, 0)
+         self.dec7 = nn.Conv2d(32, 3, 1, 1, 0)
+
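+     # forward returns (dec5, dec7): dec5 is the per-pixel feature map used for
+     # correspondence heatmaps, and dec7 is the RGB reconstruction.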
+     def forward(self, x):
+         # Encoder
+         enc1 = self.enc1(x)
+         enc2 = self.enc2(enc1)
+         enc3 = self.enc3(enc2)
+         enc4 = self.enc4(enc3)
+         enc5 = self.enc5(enc4)
+
+         # Decoder
+         dec1 = self.dec1(enc5)
+         dec2 = self.dec2(torch.cat((dec1, enc4), 1))
+         dec3 = self.dec3(torch.cat((dec2, enc3), 1))
+         dec4 = self.dec4(torch.cat((dec3, enc2), 1))
+         dec5 = self.dec5(torch.cat((dec4, enc1), 1))
+         dec6 = self.dec6(dec5)
+         dec7 = self.dec7(dec6)
+
+         return dec5, dec7
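+
+ # Rough shape sketch (assuming the 112x112 RGB inputs used in app.py):
+ #   x:    (B, 3, 112, 112)
+ #   dec5: (B, feature_dim, 112, 112)  -- per-pixel feature map, same resolution as the input
+ #   dec7: (B, 3, 112, 112)            -- RGB reconstruction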
model_module.py ADDED
@@ -0,0 +1,145 @@
+ import pytorch_lightning as pl
+ import torch
+ from torch import nn
+ from torch.optim import SGD
+ from torchvision.utils import save_image
+ import os
+ from utils import TripletLossBatch, pairwise_distance_squared, GetTransformedCoords, DistanceMapLogger
+ from model import Autoencoder
+
+ class AutoencoderModule(pl.LightningModule):
+     def __init__(self, feature_dim=64, learning_rate=0.1, lambda_c=0.97, initial_margin=1.0, initial_threshold=2.0, save_interval=100, output_dir="output_images"):
+         super(AutoencoderModule, self).__init__()
+         self.feature_dim = feature_dim
+         self.learning_rate = learning_rate
+         self.lambda_c = lambda_c
+         self.margin_img = initial_margin
+         self.margin_img_init = initial_margin
+         self.threshold = initial_threshold
+         self.model = Autoencoder(self.feature_dim)
+         self.criterion = nn.MSELoss()
+         self.triplet_loss = TripletLossBatch()
+         self.losses = []
+         self.save_interval = save_interval  # output interval in batches
+         self.output_dir = output_dir
+         os.makedirs(self.output_dir, exist_ok=True)
+
+     def forward(self, x):
+         return self.model(x)
+
+     def training_step(self, batch, batch_idx):
+         img, mat, _ = batch
+         batch_size, _, _, size, size = img.shape
+         img = img.view(batch_size*2, 3, size, size)
+         mat = mat.view(batch_size*2, 3, 3)
+
+         dec5_output, output = self.model(img)
+         mse_loss = self.criterion(output, img)
+
+         # Intra-image processing (triplet loss over pixel locations)
+         num_anchor_sets = 2**12
+         trip_loss = 0
+         std_list = [2.5*1.025**self.current_epoch, 5*1.025**self.current_epoch]
+         for c in std_list:
+             std = size / c
+             anchors = torch.randint(0, size, (batch_size*2, num_anchor_sets, 1, 2))
+             coords = anchors + torch.normal(0, std, (batch_size*2, num_anchor_sets, 2, 2)).long()
+             valid_coords_idx = (((coords >= 0) & (coords < size)).sum(3) == 2).sum(2) != 2
+             coords[valid_coords_idx] = 0
+             anchors[valid_coords_idx] = 0
+
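+             # Each anchor pixel gets two nearby points drawn from a Gaussian around
+             # it; the closer one becomes the positive and the farther one the
+             # negative for the triplet loss. Sets that fall outside the image are
+             # zeroed out above.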
+             # Select the closest coordinate
+             d = pairwise_distance_squared(anchors.float(), coords.float())
+             idx = torch.argmin(d, dim=2)
+             anchors, positives, negatives = self._get_triplet_coordinates(anchors, coords, idx)
+
+             # Extract the feature vectors from dec5_output
+             anchor_vectors, positive_vectors, negative_vectors = self._extract_feature_vectors(dec5_output, batch_size, anchors, positives, negatives)
+             trip_loss += self.triplet_loss(anchor_vectors, positive_vectors, negative_vectors, self.margin_img)
+
+         trip_loss /= len(std_list)
+         self.margin_img = self.margin_img_init + self.margin_img - trip_loss.detach()
+
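+         # Adaptive margin: the margin grows whenever the triplet loss drops below
+         # its initial value and shrinks when it rises above it, which tends to
+         # keep the loss near margin_img_init.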
+         # Transformation-consistency loss
+         num_samples = 2**20
+         tf_loss = self._compute_transformation_loss(dec5_output, mat, batch_size, size, num_samples)
+
+         # Batch-direction (cross-image) loss
+         bat_dist_loss = self._compute_batch_direction_loss(dec5_output, batch_size, size)
+
+         # Total loss
+         loss = mse_loss + trip_loss + 0.001 * bat_dist_loss + (0.001 * 1.**self.current_epoch) * tf_loss
+         self.log("train_loss", loss)
+
+         # VRAM management
+         del img, output
+         torch.cuda.empty_cache()
+
+         return loss
+
+
+     def _get_triplet_coordinates(self, anchors, coords, idx):
+         anchors = anchors.squeeze(2)
+         positives = coords[torch.arange(coords.size(0))[:, None, None], torch.arange(coords.size(1))[None, :, None], idx[:, :, None]].squeeze(2)
+         negatives = coords[torch.arange(coords.size(0))[:, None, None], torch.arange(coords.size(1))[None, :, None], (1 - idx)[:, :, None]].squeeze(2)
+         return anchors, positives, negatives
+
+     def _extract_feature_vectors(self, dec5_output, batch_size, anchors, positives, negatives):
+         y_anchors = anchors[:, :, 0].unsqueeze(2).expand(-1, -1, self.feature_dim)
+         x_anchors = anchors[:, :, 1].unsqueeze(2).expand(-1, -1, self.feature_dim)
+         y_positives = positives[:, :, 0].unsqueeze(2).expand(-1, -1, self.feature_dim)
+         x_positives = positives[:, :, 1].unsqueeze(2).expand(-1, -1, self.feature_dim)
+         y_negatives = negatives[:, :, 0].unsqueeze(2).expand(-1, -1, self.feature_dim)
+         x_negatives = negatives[:, :, 1].unsqueeze(2).expand(-1, -1, self.feature_dim)
+
+         anchor_vectors = dec5_output[torch.arange(batch_size*2)[:, None, None], torch.arange(self.feature_dim), y_anchors, x_anchors]
+         positive_vectors = dec5_output[torch.arange(batch_size*2)[:, None, None], torch.arange(self.feature_dim), y_positives, x_positives]
+         negative_vectors = dec5_output[torch.arange(batch_size*2)[:, None, None], torch.arange(self.feature_dim), y_negatives, x_negatives]
+         return anchor_vectors, positive_vectors, negative_vectors
+
+     def _compute_transformation_loss(self, dec5_output, mat, batch_size, size, num_samples=2**12):
+         anchor_indices = torch.randint(batch_size, (num_samples, 1), device=self.device).repeat(1, 2).reshape(num_samples*2)
+         coords_x = torch.randint(0, size, (num_samples, 1), dtype=torch.float32, device=self.device).repeat(1, 2).reshape(num_samples*2, 1)
+         coords_y = torch.randint(0, size, (num_samples, 1), dtype=torch.float32, device=self.device).repeat(1, 2).reshape(num_samples*2, 1)
+         anchor_coords = torch.cat((coords_x, coords_y), 1)
+         anchor_mat = mat[anchor_indices]
+         tf_anchor_coords = GetTransformedCoords(anchor_mat, [size/2, size/2])(anchor_coords)
+
+         anchor_vectors = torch.zeros([num_samples*2, self.feature_dim], device=self.device)
+         inner_idx_flat = ((0 <= tf_anchor_coords[:,0]) & (tf_anchor_coords[:,0] < size)) & ((0 <= tf_anchor_coords[:,1]) & (tf_anchor_coords[:,1] < size))
+         anchor_vectors[inner_idx_flat] = dec5_output[anchor_indices[inner_idx_flat], :, tf_anchor_coords[inner_idx_flat, 0], tf_anchor_coords[inner_idx_flat, 1]]
+
+         inner_idx_and = inner_idx_flat.view(num_samples, 2).t()[0] & inner_idx_flat.view(num_samples, 2).t()[1]
+         anchor_vectors = anchor_vectors.view(num_samples, 2, self.feature_dim)[inner_idx_and]
+         return pairwise_distance_squared(anchor_vectors[:,0], anchor_vectors[:,1]).mean()
+
+     def _compute_batch_direction_loss(self, dec5_output, batch_size, size):
+         N = 2**12
+         anchor_indices = torch.randint(0, batch_size, (N,)) * 2 + torch.randint(0, 2, (N,))
+         anchor_coords = torch.randint(0, size, (N, 2))
+         other_indices = torch.randint(0, batch_size-1, (N, 2)) * 2 + torch.randint(0, 2, (N, 2))
+         other_indices += (other_indices >= anchor_indices.unsqueeze(1)).long() * 2
+         other_coords = torch.randint(0, size, (N, 2, 2))
+
+         anchor_vectors = dec5_output[anchor_indices, :, anchor_coords[:, 0], anchor_coords[:, 1]]
+         other_vectors = dec5_output[other_indices, :, other_coords[:, :, 0], other_coords[:, :, 1]]
+         distances = pairwise_distance_squared(anchor_vectors.unsqueeze(1), other_vectors)
+         return distances[distances < self.threshold].sum() / ((distances < self.threshold).sum() + 1e-10)
+
+     def configure_optimizers(self):
+         optimizer = SGD(self.parameters(), lr=self.learning_rate)
+         scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: self.lambda_c**epoch)
+         return [optimizer], [scheduler]
+
+     # def save_intermediate_image(self, output, epoch):
+     #     save_image(output[:4], os.path.join(self.output_dir, f"epoch_{epoch}_output.png"), nrow=1)
+     #     print(f"Saved intermediate image at epoch {epoch}")
+
+     # def distance_map(self, _input, feature_map, epoch, x_coords=None, y_coords=None):
+     #     save_path = os.path.join(self.output_dir, f"epoch_{epoch}_distance_map.png")
+     #     DistanceMapLogger()(_input, feature_map, save_path, x_coords, y_coords)
+
+     def configure_optimizers(self):
+         optimizer = SGD(self.parameters(), lr=self.learning_rate)
+         scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: self.lambda_c**epoch)
+         return [optimizer], [scheduler]
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ --extra-index-url https://download.pytorch.org/whl/cu121
+ torch==2.2.0
+ torchvision==0.17.0
+ torchaudio==2.2.0
+ matplotlib==3.9.2
+ numpy==1.26.4
+ pytorch-lightning==2.4.0
+ scikit-learn==1.0.2
+ gradio==5.5.0
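+ # Note: app.py also imports `spaces` and PIL (Pillow), which are not pinned here;
+ # they may be installed transitively or provided by the runtime, otherwise they
+ # need to be added to this file.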
utils.py ADDED
@@ -0,0 +1,252 @@
+ import torch
+ from torch import Tensor, nn
+ import torch.nn.functional as F
+ import torchvision
+ from torchvision import transforms
+ from PIL import Image
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from sklearn.decomposition import PCA
+
+ class RandomAffineAndRetMat(torch.nn.Module):
+     def __init__(
+         self,
+         degrees,
+         translate=None,
+         scale=None,
+         shear=None,
+         interpolation=torchvision.transforms.InterpolationMode.NEAREST,
+         fill=0,
+         center=None,
+     ):
+         super().__init__()
+         self.degrees = degrees
+         self.translate = translate
+         self.scale = scale
+         self.shear = shear
+         self.interpolation = interpolation
+         self.fill = fill
+         self.center = center
+
+     def forward(self, img):
+         """
+         img (PIL Image or Tensor): Image to be transformed.
+
+         Returns:
+             tuple: (affine-transformed image, 3x3 affine matrix).
+         """
+         fill = self.fill
+         if isinstance(img, Tensor):
+             if isinstance(fill, (int, float)):
+                 fill = [float(fill)] * transforms.functional.get_image_num_channels(img)
+             else:
+                 fill = [float(f) for f in fill]
+
+         img_size = transforms.functional.get_image_size(img)
+
+         ret = transforms.RandomAffine.get_params(self.degrees, self.translate, self.scale, self.shear, img_size)
+         transformed_image = transforms.functional.affine(img, *ret, interpolation=self.interpolation, fill=fill, center=self.center)
+
+         affine_matrix = self.get_affine_matrix_from_params(ret)
+
+         return transformed_image, affine_matrix
+
+     def get_affine_matrix_from_params(self, params):
+         degrees, translate, scale, shear = params
+         degrees = torch.tensor(degrees)
+         shear = torch.tensor(shear)
+
+         # Convert the sampled parameters into 3x3 transformation matrices
+         rotation_matrix = torch.tensor([[torch.cos(torch.deg2rad(degrees)), -torch.sin(torch.deg2rad(degrees)), 0],
+                                         [torch.sin(torch.deg2rad(degrees)), torch.cos(torch.deg2rad(degrees)), 0],
+                                         [0, 0, 1]])
+
+         translation_matrix = torch.tensor([[1, 0, translate[0]],
+                                            [0, 1, translate[1]],
+                                            [0, 0, 1]]).to(torch.float32)
+
+         scaling_matrix = torch.tensor([[scale, 0, 0],
+                                        [0, scale, 0],
+                                        [0, 0, 1]])
+
+         shearing_matrix = torch.tensor([[1, -torch.tan(torch.deg2rad(shear[0])), 0],
+                                         [-torch.tan(torch.deg2rad(shear[1])), 1, 0],
+                                         [0, 0, 1]])
+
+         # Compose the transformation matrices
+         affine_matrix = translation_matrix.mm(rotation_matrix).mm(scaling_matrix).mm(shearing_matrix)
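+         # Applied to column vectors, the product acts right-to-left: shear, then
+         # scale, then rotation, then translation.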
+
+         return affine_matrix
+
+ class GetTransformedCoords(nn.Module):
+     def __init__(self, affine_matrix, center):
+         super().__init__()
+         self.affine_matrix = affine_matrix
+         self.center = center
+
+     def forward(self, _coords):
+         # coords: like tensor([[43, 26], [44, 27], [45, 28]])
+         center_x, center_y = self.center
+         # Shift the coordinates so the image center becomes the origin
+         coords = _coords.clone()
+         coords[:, 0] -= center_x
+         coords[:, 1] -= center_y
+
+         # Apply the transform to each batch element in homogeneous coordinates
+         homogeneous_coordinates = torch.cat([coords, torch.ones(coords.shape[0], 1, dtype=torch.float32, device=coords.device)], dim=1)
+         transformed_coordinates = torch.bmm(self.affine_matrix, homogeneous_coordinates.unsqueeze(-1)).squeeze(-1)
+
+         # Clamp to the image bounds (currently disabled)
+         # transformed_x = max(0, min(width - 1, transformed_coordinates[:, 0]))
+         # transformed_y = max(0, min(height - 1, transformed_coordinates[:, 1]))
+         transformed_x = transformed_coordinates[:, 0]
+         transformed_y = transformed_coordinates[:, 1]
+
+         transformed_x += center_x
+         transformed_y += center_y
+         return torch.stack([transformed_x, transformed_y]).t().to(torch.long)
+
+ # Version of pairwise_distance that does not take the square root
+ def pairwise_distance_squared(a, b):
+     return torch.sum((a - b) ** 2, dim=-1)
+
+ def cosine_similarity(a, b):
+     # Dot product of a and b
+     dot_product = torch.matmul(a, b)
+     # Norms (magnitudes) of a and b
+     norm_a = torch.sqrt(torch.sum(a ** 2, dim=-1))
+     norm_b = torch.sqrt(torch.sum(b ** 2, dim=-1))
+     # Cosine similarity: dot product divided by the product of the norms
+     return dot_product / (norm_a * norm_b)
+
+ def batch_cosine_similarity(a, b):
+     # Dot product of a and b
+     dot_product = torch.einsum('bnd,bnd->bn', a, b)
+     # Norms (magnitudes) of a and b
+     norm_a = torch.sqrt(torch.sum(a ** 2, dim=-1))
+     norm_b = torch.sqrt(torch.sum(b ** 2, dim=-1))
+     # Cosine similarity: dot product divided by the product of the norms
+     return dot_product / (norm_a * norm_b)
+
+ class TripletLossBatch(nn.Module):
+     def __init__(self):
+         super(TripletLossBatch, self).__init__()
+
+     def forward(self, anchor, positive, negative, margin=1.0):
+         distance_positive = F.pairwise_distance(anchor, positive, p=2)
+         distance_negative = F.pairwise_distance(anchor, negative, p=2)
+         losses = torch.relu(distance_positive - distance_negative + margin)
+         return losses.mean()
+
+ class TripletLossCosineSimilarity(nn.Module):
+     def __init__(self):
+         super(TripletLossCosineSimilarity, self).__init__()
+
+     def forward(self, anchor, positive, negative, margin=1.0):
+         distance_positive = 1 - batch_cosine_similarity(anchor, positive)
+         distance_negative = 1 - batch_cosine_similarity(anchor, negative)
+         losses = torch.relu(distance_positive - distance_negative + margin)
+         return losses.mean()
+
+ def imsave(img):
+     img = torchvision.utils.make_grid(img)
+     img = img / 2 + 0.5
+     npimg = img.detach().cpu().numpy()
+     # plt.imshow(np.transpose(npimg, (1, 2, 0)))
+     # plt.show()
+     # save image
+     npimg = np.transpose(npimg, (1, 2, 0))
+     npimg = npimg * 255
+     npimg = npimg.astype(np.uint8)
+     Image.fromarray(npimg).save('sample.png')
+
+ def norm_img(img):
+     return (img - img.min()) / (img.max() - img.min())
+
+ def norm_img2(img):
+     return (img - img.min()) / (img.max() - img.min()) * 255
+
+ class DistanceMapLogger:
+     def __call__(self, img, feature_map, save_path, x_coords=None, y_coords=None):
+         device = feature_map.device
+         batch_size = feature_map.size(0)
+         feature_dim = feature_map.size(1)
+         image_size = feature_map.size(2)
+
+         if x_coords is None:
+             x_coords = [69] * batch_size
+         if y_coords is None:
+             y_coords = [42] * batch_size
+
+         # Project the features onto 3 PCA components for an RGB visualization
+         pca = PCA(n_components=3)
+         pca_result = pca.fit_transform(feature_map.permute(0, 2, 3, 1).reshape(-1, feature_dim).detach().cpu().numpy())  # run PCA
+         reshaped_pca_result = pca_result.reshape(batch_size, image_size, image_size, 3)  # back to (B, H, W, 3)
+
+         sample_num = 0
+         vectors = feature_map[torch.arange(feature_map.size(0)), :, y_coords, x_coords]  # feature vectors at the given (y, x) locations
+         vector = vectors[sample_num]
+
+         # Compute the distance to the selected vector for every feature map in the batch:
+         # permute the feature_map dimensions and flatten height and width
+         reshaped_feature_map = feature_map.permute(0, 2, 3, 1).view(feature_map.size(0), -1, feature_dim)
+         batch_distance_map = F.pairwise_distance(reshaped_feature_map, vector).view(feature_map.size(0), image_size, image_size)
+         # batch_distance_map = F.cosine_similarity(reshaped_feature_map, vector.unsqueeze(0).unsqueeze(0).expand(65, size*size, 32), dim=2).permute(1, 0).reshape(feature_map.size(0), size, size)
+         norm_batch_distance_map = 1 / torch.cosh(20 * (batch_distance_map - batch_distance_map.min()) / (batch_distance_map.max() - batch_distance_map.min())) ** 2
+         # norm_batch_distance_map[:,0,0] = 0.001
+         # Visualization and saving
+         fig, axes = plt.subplots(5, 4, figsize=(20, 25))
+         for ax in axes.flatten():
+             ax.axis('off')
+         # Remove spacing between subplots
+         plt.subplots_adjust(wspace=0, hspace=0)
+         # Remove the outer margins as well
+         plt.subplots_adjust(left=0, right=1, bottom=0, top=1)
+
+         # Visualize the distance maps
+         for i in range(5):
+             axes[i, 0].imshow(norm_batch_distance_map[i].detach().cpu(), cmap='hot')
+             if i == sample_num:
+                 axes[i, 0].scatter(x_coords[i], y_coords[i], c='b', s=7)
+
+             distance_map = torch.cat(((norm_batch_distance_map[i] / norm_batch_distance_map[i].max()).unsqueeze(0), torch.zeros(2, image_size, image_size, device=device)))
+             alpha = 0.9  # Transparency factor for the heatmap overlay
+             blended_tensor = (1 - alpha) * img[i] + alpha * distance_map
+             axes[i, 1].imshow(norm_img(blended_tensor.permute(1, 2, 0).detach().cpu()))
+
+             axes[i, 2].imshow(norm_img(img[i].permute(1, 2, 0).detach().cpu()))
+
+             axes[i, 3].imshow(norm_img(reshaped_pca_result[i]))
+
+         plt.savefig(save_path)
+
+
+
+     def get_heatmaps(self, img, feature_map, source_num=0, target_num=1, x_coords=69, y_coords=42):
+         device = feature_map.device
+         batch_size = feature_map.size(0)
+         feature_dim = feature_map.size(1)
+         image_size = feature_map.size(2)
+
+         x_coords = [x_coords] * batch_size
+         y_coords = [y_coords] * batch_size
+
+         vectors = feature_map[torch.arange(feature_map.size(0)), :, y_coords, x_coords]  # feature vectors at the given (y, x) locations
+         vector = vectors[source_num]
+
+         # Compute the distance to the selected vector for every feature map in the batch:
+         # permute the feature_map dimensions and flatten height and width
+         reshaped_feature_map = feature_map.permute(0, 2, 3, 1).view(feature_map.size(0), -1, feature_dim)
+         batch_distance_map = F.pairwise_distance(reshaped_feature_map, vector).view(feature_map.size(0), image_size, image_size)
+         # batch_distance_map = F.cosine_similarity(reshaped_feature_map, vector.unsqueeze(0).unsqueeze(0).expand(65, size*size, 32), dim=2).permute(1, 0).reshape(feature_map.size(0), size, size)
+         norm_batch_distance_map = 1 / torch.cosh(20 * (batch_distance_map - batch_distance_map.min()) / (batch_distance_map.max() - batch_distance_map.min())) ** 2
+         # norm_batch_distance_map[:,0,0] = 0.001
+
+         source_map = norm_batch_distance_map[source_num]
+         target_map = norm_batch_distance_map[target_num]
+
+         alpha = 0.9
+         blended_source = (1 - alpha) * img[source_num] + alpha * torch.cat(((norm_batch_distance_map[source_num] / norm_batch_distance_map[source_num].max()).unsqueeze(0), torch.zeros(2, image_size, image_size, device=device)))
+         blended_target = (1 - alpha) * img[target_num] + alpha * torch.cat(((norm_batch_distance_map[target_num] / norm_batch_distance_map[target_num].max()).unsqueeze(0), torch.zeros(2, image_size, image_size, device=device)))
+
+         return source_map, target_map, blended_source, blended_target