initial commit
- app.py +154 -0
- class_names.ob +0 -0
- examples/AppleScab2.JPG +0 -0
- examples/PotatoHealthy2.JPG +0 -0
- examples/TomatoHealthy2.JPG +0 -0
- examples/TomatoYellowCurlVirus6.JPG +0 -0
- model.py +134 -0
- pytorch_vit_b_16_timm.pth +3 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,154 @@
+import os
+import gc
+import PIL
+import glob
+import timm
+import torch
+import nopdb
+import pickle
+import torchvision
+
+import numpy as np
+import gradio as gr
+from torch import nn
+from PIL import Image
+import matplotlib.pyplot as plt
+import IPython.display as ipd
+from typing import Tuple, Dict
+from timeit import default_timer as timer
+from timm.data import resolve_data_config, create_transform
+from torchvision import datasets, transforms
+
+# Attention maps are written to this directory before being shown in the gallery.
+os.makedirs("storage", exist_ok=True)
+
+example_list = [["examples/" + example] for example in os.listdir("examples")]
+
+vision_transformer_weights = torch.load('pytorch_vit_b_16_timm.pth',
+                                        map_location=torch.device('cpu'))
+
+vision_transformer = timm.create_model('vit_base_patch16_224', pretrained=False)
+
+# Replace the 1000-class ImageNet head with a 38-class plant-disease head.
+vision_transformer.head = nn.Linear(in_features=768,
+                                    out_features=38)
+
+vision_transformer.load_state_dict(vision_transformer_weights)
+vision_transformer.eval()
+
+data_transforms = transforms.Compose([
+    transforms.Resize(size=(256, 256)),
+    transforms.CenterCrop(size=224),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                         std=[0.229, 0.224, 0.225])
+])
+
+def inv_normalize(tensor):
+    """Rescale an image tensor back to the 0-255 range."""
+    tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min()) * (256 - 1e-5)
+    return tensor
+
+def inv_transform(tensor, normalize=True):
+    """Convert a tensor back to a PIL image."""
+    if normalize:
+        tensor = inv_normalize(tensor)
+    array = tensor.detach().cpu().numpy()
+    array = array.transpose(1, 2, 0).astype(np.uint8)
+    return PIL.Image.fromarray(array)
+
+with open('class_names.ob', 'rb') as fp:
+    class_names = pickle.load(fp)
+
+img = PIL.Image.open('examples/TomatoYellowCurlVirus6.JPG').convert('RGB')
+img_transformed = data_transforms(img)
+
+def predict_disease(image) -> Tuple[Dict, float]:
+    """Return the top prediction classes with probabilities for an input image."""
+    img_tensor = data_transforms(image)
+    start_time = timer()
+    prediction_dict = {}
+    with torch.inference_mode():
+        [logits] = vision_transformer(img_tensor[None])
+        probs = torch.softmax(logits, dim=0)
+        topk_prob, topk_id = torch.topk(probs, 3)
+        for i in range(topk_prob.size(0)):
+            prediction_dict[class_names[topk_id[i]]] = topk_prob[i].item()
+    prediction_time = round(timer() - start_time, 5)
+    return prediction_dict, prediction_time
+
+def predict_tensor(img_tensor):
+    """Run a forward pass on an already-transformed tensor; used only to trigger attention capture."""
+    with torch.inference_mode():
+        [logits] = vision_transformer(img_tensor[None])
+        probs = torch.softmax(logits, dim=0)
+        topk_prob, topk_id = torch.topk(probs, 3)
+
+# Capture one attention call at startup (exercises the nopdb hook on an example image).
+with nopdb.capture_call(vision_transformer.blocks[5].attn.forward) as attn_call:
+    predict_tensor(img_transformed)
+
+def plot_attention(image, layer_num):
+    """Plot the average attention weight each head of the chosen encoder layer gives to each image patch."""
+    input_data = data_transforms(image)
+    with nopdb.capture_call(vision_transformer.blocks[int(layer_num) - 1].attn.forward) as attn_call:
+        predict_tensor(input_data)
+    attn = attn_call.locals['attn'][0]  # attention weights captured inside the chosen block
+    with torch.inference_mode():
+        # loop over the attention heads
+        attention_head_num = 0
+        for h_weights in attn:
+            h_weights = h_weights.mean(axis=-2)  # average over all attention keys
+            h_weights = h_weights[1:]  # skip the [class] token
+            attention_head_num += 1
+            plot_weights(input_data, h_weights, attention_head_num)
+    attention_maps = glob.glob('storage/*.png')
+    return attention_maps
+
+def plot_weights(input_data, patch_weights, num_attention_head):
+    """Save the image with each patch scaled by its attention weight: the brighter the patch, the higher the attention."""
+    # multiply each 16x16 patch of the input image by the corresponding weight
+    plot = inv_normalize(input_data.clone())
+    for i in range(patch_weights.shape[0]):
+        x = i * 16 % 224
+        y = i // (224 // 16) * 16
+        plot[:, y:y + 16, x:x + 16] *= patch_weights[i]
+    attn_map_img = inv_transform(plot)
+    attn_map_img = attn_map_img.resize((224, 224), Image.LANCZOS)
+    attn_map_img.save(f"storage/attention_map_{num_attention_head}.png", "PNG")
+
+title_classify = "Image Based Plant Disease Identification 🍃🤓"
+
+description_classify = """Fine-tuned a Vision Transformer Base (Patch Size: 16 | Image Size: 224) architecture to
+identify plant diseases."""
+
+article_classify = """Upload an image from the example list or choose one of your own. [Dataset Classes](https://data.mendeley.com/datasets/tywbtsjrjv/1)"""
+
+title_attention = "Visualize Attention Weights 🧊🔍"
+
+description_attention = """The Vision Transformer Base architecture has 12 Transformer Encoder layers, each with 12 attention heads."""
+
+article_attention = """From the dropdown menu, choose the Encoder layer whose attention weights you would like to visualize."""
+
+classify_interface = gr.Interface(
+    fn=predict_disease,
+    inputs=gr.Image(type="pil", label="Image"),
+    outputs=[gr.Label(num_top_classes=3, label="Predictions"),
+             gr.Number(label="Prediction time (secs)")],
+    examples=example_list,
+    title=title_classify,
+    description=description_classify,
+    article=article_classify,
+    thumbnail="https://images.unsplash.com/photo-1470058869958-2a77ade41c02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1170&q=80"
+)
+
+attention_interface = gr.Interface(
+    fn=plot_attention,
+    inputs=[gr.Image(type="pil", label="Image"),
+            gr.Dropdown(choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
+                        label="Attention Layer", value="6")],
+    outputs=gr.Gallery(label="Attention Maps").style(grid=(3, 4)),
+    examples=example_list,
+    title=title_attention,
+    description=description_attention,
+    article=article_attention,
+    thumbnail="https://images.unsplash.com/photo-1470058869958-2a77ade41c02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1170&q=80"
+)
+
+demo = gr.TabbedInterface([classify_interface, attention_interface],
+                          ["Identify Disease", "Visualize Attention Map"],
+                          title="NatureAI Diagnostics🧑‍⚕️").launch(debug=False, share=True)
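Since app.py builds and launches the demo at import time, the quickest sanity check is an inline snippet placed just before the gr.TabbedInterface call (a minimal sketch; it reuses the names already defined above in app.py):

# Hypothetical smoke test: classify one bundled example image.
test_img = PIL.Image.open("examples/AppleScab2.JPG").convert("RGB")
preds, secs = predict_disease(test_img)
print(preds)  # expect a 3-entry dict mapping class names to probabilities
print(secs)   # prediction time in seconds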
class_names.ob
ADDED
Binary file (1.08 kB).
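class_names.ob is the pickled list of the 38 class names that both app.py and model.py load. A sketch of how such a file could be produced from an ImageFolder-style dataset (the dataset path here is hypothetical, not part of this repo):

import pickle
from torchvision import datasets

# Hypothetical location of the training split; class names come from the folder names.
train_data = datasets.ImageFolder("plant_village/train")
with open("class_names.ob", "wb") as fp:
    pickle.dump(train_data.classes, fp)  # list of class-name strings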
examples/AppleScab2.JPG
ADDED
examples/PotatoHealthy2.JPG
ADDED
examples/TomatoHealthy2.JPG
ADDED
examples/TomatoYellowCurlVirus6.JPG
ADDED
model.py
ADDED
@@ -0,0 +1,134 @@
+import torch
+import pickle
+import torchvision
+from torch import nn
+from torchvision import datasets, transforms
+
+PATCH_SIZE = 16
+IMG_SIZE = 224
+
+class PatchEmbeddings(nn.Module):
+    """Split an image into patches with a strided convolution and project each patch to an embedding."""
+    def __init__(self, in_channels: int=3,
+                 patch_size: int=16,
+                 embedding_dim: int=768):
+        super().__init__()
+
+        self.generate_patches = nn.Conv2d(in_channels=in_channels,
+                                          out_channels=embedding_dim,
+                                          kernel_size=patch_size,
+                                          stride=patch_size, padding=0)
+
+        self.flatten = nn.Flatten(start_dim=2, end_dim=3)
+
+    def forward(self, x: torch.Tensor):
+        image_resolution = x.shape[-1]
+        assert image_resolution % PATCH_SIZE == 0, "Image size must be divisible by patch size!"
+
+        # (B, D, H/P, W/P) -> (B, D, N) -> (B, N, D)
+        return self.flatten(self.generate_patches(x)).permute(0, 2, 1)
+
+class MultiheadSelfAttention(nn.Module):
+    def __init__(self, embedding_dim: int=768,
+                 num_heads: int=12, attn_dropout: float=0.0):
+        super().__init__()
+
+        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
+
+        self.multihead_attn = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads,
+                                                    dropout=attn_dropout, batch_first=True)
+
+    def forward(self, x: torch.Tensor):
+        x = self.layer_norm(x)
+
+        attn_output, _ = self.multihead_attn(query=x, key=x, value=x,
+                                             need_weights=False)
+        return attn_output
+
+class MLPBlock(nn.Module):
+    def __init__(self, embedding_dim: int=768,
+                 mlp_size: int=3072, dropout: float=0.1):
+        super().__init__()
+
+        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
+
+        self.mlp = nn.Sequential(
+            nn.Linear(in_features=embedding_dim,
+                      out_features=mlp_size),
+            nn.GELU(),
+            nn.Dropout(p=dropout),
+            nn.Linear(in_features=mlp_size,
+                      out_features=embedding_dim),
+            nn.Dropout(p=dropout)
+        )
+
+    def forward(self, x: torch.Tensor):
+        return self.mlp(self.layer_norm(x))
+
+class TransformerEncoderBlock(nn.Module):
+    def __init__(self, embedding_dim: int=768,
+                 mlp_size: int=3072, num_heads: int=12,
+                 mlp_dropout: float=0.1, attn_dropout: float=0.0):
+        super().__init__()
+
+        self.msa_block = MultiheadSelfAttention(embedding_dim=embedding_dim,
+                                                num_heads=num_heads, attn_dropout=attn_dropout)
+
+        self.mlp_block = MLPBlock(embedding_dim=embedding_dim,
+                                  mlp_size=mlp_size, dropout=mlp_dropout)
+
+    def forward(self, x: torch.Tensor):
+        # residual connections around both sub-blocks
+        x = self.msa_block(x) + x
+        x = self.mlp_block(x) + x
+        return x
+
+class VisionTransformer(nn.Module):
+    def __init__(self, img_size: int=IMG_SIZE,
+                 in_channels: int=3, patch_size: int=16,
+                 num_transformer_layers: int=12, embedding_dim: int=768,
+                 mlp_size: int=3072, num_heads: int=12,
+                 attn_dropout: float=0.0, mlp_dropout: float=0.1,
+                 embedding_dropout: float=0.1, num_classes: int=38):
+        super().__init__()
+
+        assert img_size % patch_size == 0, "Image size must be divisible by patch size!"
+
+        self.num_patches = (img_size * img_size) // patch_size**2
+
+        # learnable [class] token prepended to the patch sequence
+        self.class_embedding = nn.Parameter(data=torch.randn(1, 1, embedding_dim),
+                                            requires_grad=True)
+
+        self.position_embedding = nn.Parameter(data=torch.randn(1, self.num_patches+1, embedding_dim),
+                                               requires_grad=True)
+
+        self.embedding_dropout = nn.Dropout(p=embedding_dropout)
+
+        self.patch_embeddings = PatchEmbeddings(in_channels=in_channels,
+                                                patch_size=patch_size, embedding_dim=embedding_dim)
+
+        self.transformer_encoder = nn.Sequential(*[TransformerEncoderBlock(embedding_dim=embedding_dim,
+                                                                           num_heads=num_heads, mlp_size=mlp_size,
+                                                                           mlp_dropout=mlp_dropout,
+                                                                           attn_dropout=attn_dropout)
+                                                   for _ in range(num_transformer_layers)])
+
+        self.classifier = nn.Sequential(
+            nn.LayerNorm(normalized_shape=embedding_dim),
+            nn.Linear(in_features=embedding_dim,
+                      out_features=num_classes)
+        )
+
+    def forward(self, x: torch.Tensor):
+        batch_size = x.shape[0]
+
+        class_token = self.class_embedding.expand(batch_size, -1, -1)
+
+        x = self.patch_embeddings(x)
+        x = torch.cat((class_token, x), dim=1)
+
+        x = self.position_embedding + x
+        x = self.embedding_dropout(x)
+
+        x = self.transformer_encoder(x)
+        x = self.classifier(x[:, 0])  # classify from the [class] token only
+
+        return x
+
+with open("class_names.ob", "rb") as fp:
+    class_names = pickle.load(fp)
+
+vision_transformer = VisionTransformer(num_classes=len(class_names))
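A minimal shape check for this from-scratch implementation, assuming class_names.ob sits next to model.py (importing model runs the pickle load at the bottom of the file):

import torch
from model import vision_transformer

x = torch.randn(1, 3, 224, 224)  # one 224x224 RGB image
with torch.inference_mode():
    logits = vision_transformer(x)
print(logits.shape)  # torch.Size([1, 38]) -- one logit per disease class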
pytorch_vit_b_16_timm.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b82b34f5fc2aa9be5dac0f146fcccb9589481ee5f93717d83f158695329da181
+size 343366929
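The checkpoint is stored as a Git LFS pointer, so the real weights must be fetched before torch.load in app.py will work. A sketch for verifying a fetched checkpoint against the pointer's size and SHA-256:

import os
import hashlib

path = "pytorch_vit_b_16_timm.pth"
assert os.path.getsize(path) == 343366929, "file is still an LFS pointer or truncated"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
print(sha.hexdigest())  # expect b82b34f5fc2aa9be5dac0f146fcccb9589481ee5f93717d83f158695329da181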
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+torch==1.13.1
+torchvision==0.14.1
+gradio==3.16.2
+timm==0.4.5
+nopdb
+IPython