osanseviero HF staff committed on
Commit
442c76a
1 Parent(s): 7913495

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -57
app.py CHANGED
@@ -43,7 +43,7 @@ def get_attention_maps(pixel_values, attentions, nh, out, img_path):
43
  attentions = attentions.detach().numpy()
44
 
45
  # sum all attentions
46
- fname = os.path.join(out, "attn-" + os.path.basename(img_path))
47
  plt.imsave(
48
  fname=fname,
49
  arr=sum(
@@ -51,14 +51,15 @@ def get_attention_maps(pixel_values, attentions, nh, out, img_path):
51
  for i in range(attentions.shape[0])
52
  ),
53
  cmap="inferno",
54
- format="png",
55
  )
56
-
57
 
58
  def inference(inp: str, out: str):
59
  print(f"Generating attention images to {out}")
60
 
61
  # I had to process one at a time since colab was crashing...
 
62
  for img_path in tqdm(sorted(glob.glob(os.path.join(inp, "*.jpg")))):
63
  with open(img_path, "rb") as f:
64
  img = Image.open(f)
@@ -78,71 +79,47 @@ def inference(inp: str, out: str):
78
  attentions = attentions[0, :, 0, 1:].reshape(nh, -1)
79
 
80
  # sum and save attention maps
81
- get_attention_maps(pixel_values, attentions, nh, out, img_path)
82
-
83
- def extract_frames_from_video(inp: str, out: str):
84
- vidcap = cv2.VideoCapture(inp)
85
- fps = vidcap.get(cv2.CAP_PROP_FPS)
86
-
87
- print(f"Video: {inp} ({fps} fps)")
88
- print(f"Extracting frames to {out}")
89
-
90
- success, image = vidcap.read()
91
- count = 0
92
- while success:
93
- cv2.imwrite(
94
- os.path.join(out, f"frame-{count:04}.jpg"),
95
- image,
96
- )
97
- success, image = vidcap.read()
98
- count += 1
99
- return fps
100
-
101
- def generate_video_from_images(inp: str, out_name: str, fps: int):
102
- img_array = []
103
- attention_images_list = sorted(glob.glob(os.path.join(inp, "attn-*.jpg")))
104
-
105
- # Get size of the first image
106
- with open(attention_images_list[0], "rb") as f:
107
- img = Image.open(f)
108
- img = img.convert("RGB")
109
- size = (400, 400)
110
- img_array.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
111
-
112
- print(f"Generating video {size} to {out_name}")
113
-
114
- for filename in tqdm(attention_images_list[1:]):
115
- with open(filename, "rb") as f:
116
- img = Image.open(f)
117
- img = img.convert("RGB")
118
- img_array.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
119
-
120
- out = cv2.VideoWriter(
121
- out_name,
122
- cv2.VideoWriter_fourcc(*"MP4V"),
123
- fps,
124
- size,
125
- )
126
 
127
- for i in range(len(img_array)):
128
- out.write(img_array[i])
129
- out.release()
130
- print("Done")
131
- return
132
 
133
  def func(video):
 
 
 
 
134
  frames_folder = os.path.join("output", "frames")
135
  attention_folder = os.path.join("output", "attention")
136
 
137
  os.makedirs(frames_folder, exist_ok=True)
138
  os.makedirs(attention_folder, exist_ok=True)
139
 
140
- fps = extract_frames_from_video(video, frames_folder)
 
 
 
 
 
 
 
 
 
 
141
 
142
- inference(frames_folder,attention_folder)
143
- generate_video_from_images(attention_folder, "video.mp4", fps)
144
 
145
- return "video.mp4"
 
 
 
 
 
 
 
 
 
 
146
 
147
 
148
  title = "Interactive demo: DINO"
 
43
  attentions = attentions.detach().numpy()
44
 
45
  # sum all attentions
46
+ fname = os.path.join(out, os.path.basename(img_path))
47
  plt.imsave(
48
  fname=fname,
49
  arr=sum(
 
51
  for i in range(attentions.shape[0])
52
  ),
53
  cmap="inferno",
54
+ format="jpg",
55
  )
56
+ return fname
57
 
58
  def inference(inp: str, out: str):
59
  print(f"Generating attention images to {out}")
60
 
61
  # I had to process one at a time since colab was crashing...
62
+ fnames = []
63
  for img_path in tqdm(sorted(glob.glob(os.path.join(inp, "*.jpg")))):
64
  with open(img_path, "rb") as f:
65
  img = Image.open(f)
 
79
  attentions = attentions[0, :, 0, 1:].reshape(nh, -1)
80
 
81
  # sum and save attention maps
82
+ fnames.append(get_attention_maps(pixel_values, attentions, nh, out, img_path))
83
+ return fnames
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
 
 
 
 
 
85
 
86
def func(video):
    """Gradio entry point: render DINO self-attention maps over a video.

    Extracts every frame from *video*, runs `inference` to write one
    attention-map image per frame, then reassembles those images into a
    new video at the source frame rate.

    Parameters
    ----------
    video : str
        Filesystem path of the uploaded video (as provided by Gradio).

    Returns
    -------
    str
        Path of the generated video file, or 'trim.mp4' (a pre-rendered
        "please trim your video" clip) when the upload is too long.
    """
    clip = VideoFileClip(video)
    # Guard against long uploads: per-frame ViT inference is expensive,
    # so refuse anything over 10 seconds.
    if clip.duration > 10:
        return 'trim.mp4'

    frames_folder = os.path.join("output", "frames")
    attention_folder = os.path.join("output", "attention")

    os.makedirs(frames_folder, exist_ok=True)
    os.makedirs(attention_folder, exist_ok=True)

    # BUG FIX: the original did `VideoFileClip(inp)` / `f"Video: {inp} ..."`,
    # but `inp` is not defined in this function (the parameter is `video`),
    # which raised NameError on every call. Reuse the clip opened above.
    fps = clip.fps  # keep the source frame rate for the output video

    print(f"Video: {video} ({fps} fps)")
    print(f"Extracting frames to {frames_folder}")

    clip.write_images_sequence(
        os.path.join(frames_folder, "frame-count%03d.jpg"),
    )

    # `inference` returns the attention-image file names in frame order,
    # so the reassembled clip preserves the original ordering.
    output_frame_fnames = inference(frames_folder, attention_folder)

    new_clip = ImageSequenceClip(output_frame_fnames, fps=fps)
    new_clip.write_videofile("my_new_video.mp4")

    return "my_new_video.mp4"
113
+
114
# Text shown on the Gradio demo page.
title = "Interactive demo: DINO"
description = (
    "Demo for Facebook AI's DINO, a new method for self-supervised training of Vision Transformers. Using this method, they are capable of segmenting objects within an image without having ever been trained to do so. This can be observed by displaying the self-attention of the heads from the last layer for the [CLS] token query. This demo uses a ViT-S/8 trained with DINO. To use it, simply upload an image or use the example image below. Results will show up in a few seconds."
)
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.14294'>Emerging Properties in Self-Supervised Vision Transformers</a> | <a href='https://github.com/facebookresearch/dino'>Github Repo</a></p>"

# Wire `func` into a video-in / video-out Gradio interface.
iface = gr.Interface(
    fn=func,
    inputs=gr.inputs.Video(type=None),
    outputs="video",
    title=title,
    description=description,
    article=article,
)
123
 
124
 
125
  title = "Interactive demo: DINO"