import os

import cv2
import numpy as np
import patchify
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torchvision import transforms

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Interpreter(nn.Module):
    def __init__(self, 
                 class_count:int,
                 sample_yolo_output,
                 device,
                ):
        super().__init__()

        c = 32

        self._conv1 = nn.Conv2d(in_channels= 3,   out_channels= 2*c,  kernel_size=5, padding=2)
        self._conv2 = nn.Conv2d(in_channels= 2*c, out_channels= 4*c,  kernel_size=5, padding=2)
        self._conv3 = nn.Conv2d(in_channels= 4*c, out_channels= 8*c,  kernel_size=5, padding=2)
        self._conv4 = nn.Conv2d(in_channels= 8*c, out_channels=16*c,  kernel_size=3, padding=1)
        self._conv5 = nn.Conv2d(in_channels=16*c, out_channels=32*c,  kernel_size=3, padding=1)
        self._conv6 = nn.Conv2d(in_channels=32*c, out_channels=64*c,  kernel_size=3, padding=1)

        # infer the flattened conv-output size by running a sample input once
        self._linear_size = self.calc_linear(sample_yolo_output)

        self._fc1 = nn.Linear(self._linear_size,512)
        self._fc2 = nn.Linear(512, class_count)
        
        self.to(device)
        self.device = device
        self.train()

    def calc_linear(self, sample_yolo_output) -> int:
        # the model still lives on the CPU here (`.to(device)` runs after this
        # in __init__), so the sample input is moved to the CPU as well
        x = self.convs(sample_yolo_output.to('cpu'))
        return x.shape[-1]

    def convs(self, x: Tensor) -> Tensor:
        # six conv -> ReLU -> 2x2 max-pool stages; each pool halves H and W
        x = F.max_pool2d(F.relu(self._conv1(x)), (2,2))
        x = F.max_pool2d(F.relu(self._conv2(x)), (2,2))
        x = F.max_pool2d(F.relu(self._conv3(x)), (2,2))
        x = F.max_pool2d(F.relu(self._conv4(x)), (2,2))
        x = F.max_pool2d(F.relu(self._conv5(x)), (2,2))
        x = F.max_pool2d(F.relu(self._conv6(x)), (2,2))
        x = torch.flatten(x,1)
        return x
    
    def fc(self, x: Tensor) -> Tensor:
        x = F.relu(self._fc1(x))
        # the final layer returns raw logits (no activation), to be paired
        # with a loss such as cross-entropy that applies softmax internally
        x = self._fc2(x)
        return x

    def forward(self, x: Tensor) -> Tensor:
        x = self.convs(x)
        x = self.fc(x)
        return x
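

# A minimal sketch (illustration only, not part of the original pipeline) of
# the arithmetic that calc_linear measures empirically: each of the six
# max-pools halves the spatial size, and the last conv emits 64*c channels,
# so a 320x320 input yields 2048 * 5 * 5 = 51200 flattened features. The
# helper assumes the input side divides evenly by 64.
def _expected_linear_size(side: int = 320, c: int = 32) -> int:
    pooled = side // 2 ** 6          # six halving pool stages
    return 64 * c * pooled * pooled  # channels * height * width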


class CNN_Model(nn.Module):
    def __init__(self,
                 image_size: tuple[int,int],
                 interpreter: Interpreter,
    ):
        super().__init__()
        self.device = interpreter.device
        self.image_size = image_size
        self.interpreter = interpreter

    def predict(self, img_path: str) -> Tensor:
        img = cv2.imread(img_path)
        # cv2 loads channels as BGR; convert so inference sees RGB input
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = transforms.ToTensor()(img)
        img = transforms.Resize(self.image_size)(img)
        img = img[None]  # add a batch dimension
        img = img.to(self.device)

        with torch.no_grad():
            preds = self.forward(img)
        _, preds = torch.max(preds, 1)
        return preds
    
    def forward(self, x:Tensor) -> Tensor:
        x = self.interpreter(x)
        return x

    def predict_large_image(self,
                   img: np.ndarray,
                   patch_size: int = 816,
        ) -> tuple[Tensor, Tensor]:

        L = patch_size
        # tile the image into non-overlapping LxL patches; patchify crops any
        # border remainder that does not fill a whole patch
        patches = patchify.patchify(img, (L, L, 3), L)
        w, h, _ = patches.shape[:3]
        # collapse the patch grid into a batch and reorder HWC -> CHW
        patches = patches.reshape(w*h, *patches.shape[3:]).transpose((0, 3, 1, 2))

        patches = torch.from_numpy(patches)

        patches = patches.float() / 255
        patches = transforms.Resize(self.image_size)(patches)
        patches = patches.to(self.device)

        with torch.no_grad():
            preds = self.forward(patches)
        _, preds = torch.max(preds, 1)

        ratios = preds                       # per-patch class predictions
        preds = torch.mode(preds, 0).values  # majority vote across patches

        return ratios, preds
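
# A minimal sketch (the image dimensions below are hypothetical, chosen for
# illustration) of the grid predict_large_image tiles: with the step equal to
# the patch size the patches do not overlap, so e.g. a 2000x1700 image at
# patch_size=816 gives a 2x2 grid and the remaining border is dropped.
def _patch_grid(height: int, width: int, patch_size: int = 816) -> tuple[int, int]:
    return height // patch_size, width // patch_size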
    
class_count = 41  # number of output classes predicted by the classifier head

def build_interpreter(img_size=(640,640),
                      device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ) -> Interpreter:
    img_size = list(img_size)

    # dummy batch of one image, used only to size the first linear layer
    x = torch.randn([1, 3] + img_size)

    return Interpreter(class_count=class_count, sample_yolo_output=x, device=device)

def build_model(img_size = (640,640),
                device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ) -> CNN_Model:
    return CNN_Model(image_size=img_size, 
                     interpreter=build_interpreter(img_size, device))
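
# A minimal single-step training sketch, not part of the original file: it
# assumes a cross-entropy objective (fc2 emits raw logits, see Interpreter.fc)
# and that `images` and `labels` are tensors already on the model's device.
# The optimizer is a placeholder argument chosen for illustration.
def _train_step(model: CNN_Model, images: Tensor, labels: Tensor,
                optimizer: torch.optim.Optimizer) -> float:
    optimizer.zero_grad()
    logits = model(images)                  # CNN_Model.forward -> (N, class_count)
    loss = F.cross_entropy(logits, labels)  # softmax + NLL in one call
    loss.backward()
    optimizer.step()
    return loss.item()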

if __name__ == "__main__":
    model = build_model(img_size=(320,320))
    DATA_DIR = "data/image/test"
    class_dir = os.listdir(DATA_DIR)[0]
    img_name = os.listdir(f"{DATA_DIR}/{class_dir}")[0]
    img_path = f"{DATA_DIR}/{class_dir}/{img_name}"

    # predict_large_image expects an image array, not a path
    img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    ratios, pred = model.predict_large_image(img)
    print(ratios, pred)