oshita-n committed on
Commit
ee86500
1 Parent(s): b1a1700
Files changed (2) hide show
  1. app.py +44 -14
  2. requirements.txt +3 -1
app.py CHANGED
@@ -3,22 +3,56 @@ import numpy as np
3
  import gradio as gr
4
  from lavis.models import load_model_and_preprocess
5
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
def process(input_image, prompt):
    """Turn BLIP multimodal embeddings for an image/prompt pair into a binary image.

    This is the pre-commit implementation (the side of the diff whose changed
    lines carry ``-`` markers).

    Args:
        input_image: Image object exposing ``resize``, ``width`` and ``height``
            (presumably a PIL image supplied by the Gradio input -- confirm
            against the interface definition).
        prompt: Text passed through the "eval" text processor.

    Returns:
        A numpy array built from the thresholded embeddings (255 where the
        embedding value exceeds 0.3, otherwise 0), resized to the size of the
        256x256-resized input image.
    """
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # The model and its processors are loaded on every call.
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="blip_feature_extractor",
        model_type="base",
        is_eval=True,
        device=device,
    )

    input_image = input_image.resize((256, 256), Image.LANCZOS)
    # Size of the already-resized input; the mask is scaled to match it below.
    target_size = (input_image.width, input_image.height)

    image_batch = vis_processors["eval"](input_image).unsqueeze(0).to(device)
    processed_prompt = txt_processors["eval"](prompt)
    sample = {"image": image_batch, "text_input": [processed_prompt]}

    features = model.extract_features(sample, mode="multimodal")
    embeds = features.multimodal_embeds.squeeze().detach().cpu().numpy()

    # Binarise the raw embedding values: above 0.3 becomes white, the rest black.
    mask = np.where(embeds > 0.3, 255, 0).astype(np.uint8)
    mask_image = Image.fromarray(mask).resize(target_size)

    return np.array(mask_image)
24
 
@@ -29,11 +63,7 @@ if __name__ == '__main__':
29
  input_image, prompt
30
  ]
31
  outputs = "image"
32
- input_size = (256, 256)
33
- output_size = (256, 256)
34
  iface = gr.Interface(fn=process,
35
  inputs=ips,
36
- outputs=outputs,
37
- input_size=input_size,
38
- output_size=output_size)
39
- iface.launch()
 
3
  import gradio as gr
4
  from lavis.models import load_model_and_preprocess
5
  from PIL import Image
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.cm as cm
8
+ import torchvision
9
+
def create_heatmap(activation_map):
    """Convert an activation map tensor into a jet-coloured uint8 heatmap array.

    Args:
        activation_map: Tensor-like object supporting
            ``squeeze().detach().cpu().numpy()`` (e.g. a torch tensor).

    Returns:
        A ``numpy.uint8`` array holding the ``cm.jet`` colours of the
        min-max-normalised map, scaled to the 0-255 range. The colormap call
        appends a colour-channel axis to the input's shape (RGBA per the
        matplotlib colormap documentation).
    """
    # Convert the activation map to a numpy array.
    activation_map_np = activation_map.squeeze().detach().cpu().numpy()

    # Get the minimum and maximum of the activation map.
    min_value = np.min(activation_map_np)
    max_value = np.max(activation_map_np)
    value_range = max_value - min_value

    # Normalise the activation map into the 0-1 range.
    if value_range > 0:
        normalized_map = (activation_map_np - min_value) / value_range
    else:
        # A constant (or NaN-ranged) map has no spread to normalise; dividing
        # would produce 0/0 = NaN and a runtime warning. Map everything to the
        # low end of the colormap instead. The dtype is forced to float so the
        # colormap treats the values as 0-1 fractions rather than as indices.
        normalized_map = np.zeros_like(activation_map_np, dtype=np.float64)

    # Convert the normalised activation map to a heatmap.
    heatmap = cm.jet(normalized_map)

    # Scale the heatmap to the [0, 255] range and convert it to uint8.
    heatmap = np.uint8(255 * heatmap)

    return heatmap
28
 
# Loaded (model, vis_processors, txt_processors) triples, keyed by device name,
# so that the model is not reloaded on every request.
_PNP_VQA_CACHE = {}


def _load_pnp_vqa(device):
    """Return the pnp_vqa model and its processors for *device*, loading them once."""
    cache_key = str(device)
    if cache_key not in _PNP_VQA_CACHE:
        _PNP_VQA_CACHE[cache_key] = load_model_and_preprocess(
            name="pnp_vqa", model_type="base", is_eval=True, device=device
        )
    return _PNP_VQA_CACHE[cache_key]


def process(input_image, prompt):
    """Produce a binary mask from the pnp_vqa Grad-CAM map for an image/prompt pair.

    Args:
        input_image: Image object exposing ``resize`` (presumably a PIL image
            supplied by the Gradio input -- confirm against the interface
            definition).
        prompt: Text passed through the "eval" text processor.

    Returns:
        A 256x256 ``numpy.uint8`` array containing only the values 0 and 255.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Reuse a previously loaded model instead of loading it on each call.
    model, vis_processors, txt_processors = _load_pnp_vqa(device)
    input_image = input_image.resize((256, 256))

    image = vis_processors["eval"](input_image).unsqueeze(0).to(device)
    text_input = txt_processors["eval"](prompt)
    sample = {"image": image, "text_input": [text_input]}

    output = model.forward_itm(samples=sample)
    # The 'gradcams' entry is viewed as a 24x24 grid.
    # NOTE(review): presumably one cell per image patch -- confirm against LAVIS.
    activation_map = output['gradcams'].reshape(24, 24)

    relu = torch.nn.ReLU()
    # Compute the heatmap.
    heatmap = create_heatmap(activation_map)

    # Round-trip the heatmap array through PIL -> tensor -> PIL.
    # NOTE(review): to_tensor output for a uint8 image should already be
    # non-negative, so the ReLU presumably has no effect -- confirm before
    # simplifying this sequence.
    heatmap = Image.fromarray(heatmap)
    heatmap = torchvision.transforms.functional.to_tensor(heatmap)
    heatmap = relu(heatmap)
    heatmap = torchvision.transforms.functional.to_pil_image(heatmap)
    heatmap = heatmap.resize((256, 256))
    heatmap = np.array(heatmap)

    # NOTE(review): sigmoid of non-negative channel values lies in [0.5, 1], so
    # the uint8 cast below truncates every value to 0 or 1 -- confirm that this
    # is the intended mask rule. This also assumes torch.sigmoid accepts the
    # integer tensor built here (relies on int-to-float promotion).
    heatmap = torch.sigmoid(torch.from_numpy(heatmap)).numpy()
    preds = heatmap.reshape(256, 256, -1)
    preds = Image.fromarray(preds.astype(np.uint8)).convert('L')
    preds = np.array(preds)

    # np.where on Python ints yields a default-width integer array; cast it
    # back to uint8 so the returned mask is a valid 8-bit image for display.
    preds = np.where(preds > 0.5, 255, 0).astype(np.uint8)

    return preds
58
 
 
63
  input_image, prompt
64
  ]
65
  outputs = "image"
 
 
66
  iface = gr.Interface(fn=process,
67
  inputs=ips,
68
+ outputs=outputs)
69
+ iface.launch()
 
 
requirements.txt CHANGED
@@ -3,4 +3,6 @@ transformers
3
  torch
4
  pillow
5
  salesforce-lavis==1.0.2
6
- numpy
 
 
 
3
  torch
4
  pillow
5
  salesforce-lavis==1.0.2
6
+ numpy
7
+ torchvision
8
+ matplotlib