Jiqing committed on
Commit
7448d65
1 Parent(s): 4b6eacc

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +14 -21
README.md CHANGED
@@ -37,13 +37,13 @@ from transformers import AutoProcessor, TvpForVideoGrounding
37
 
38
 
39
  def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
40
- """
41
  Convert the video from its original fps to the target_fps and decode the video with PyAV decoder.
42
  Returns:
43
  frames (tensor): decoded frames from the video. Return None if the no
44
  video stream was found.
45
  fps (float): the number of frames per second of the video.
46
- """
47
  fps = float(container.streams.video[0].average_rate)
48
  clip_size = sampling_rate * num_frames / target_fps * fps
49
  delta = max(container.streams.video[0].frames - clip_size, 0)
@@ -65,12 +65,11 @@ def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, targe
65
  frames[frame.pts] = frame
66
  break
67
  frames = [frames[pts] for pts in sorted(frames)]
68
-
69
  return frames, fps
70
 
71
 
72
  def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
73
- """
74
  Decode the video and perform temporal sampling.
75
  Args:
76
  container (container): pyav container.
@@ -84,7 +83,7 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
84
  the target video fps before frame sampling.
85
  Returns:
86
  frames (tensor): decoded frames from the video.
87
- """
88
  assert clip_idx >= -2, "Not a valied clip_idx {}".format(clip_idx)
89
  frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
90
  clip_size = sampling_rate * num_frames / target_fps * fps
@@ -93,22 +92,19 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
93
  frames = [frames[idx] for idx in index]
94
  frames = [frame.to_rgb().to_ndarray() for frame in frames]
95
  frames = torch.from_numpy(np.stack(frames))
96
-
97
  return frames
98
 
99
  def get_resize_size(image, max_size):
100
- """
101
  Args:
102
  image: np.ndarray
103
  max_size: The max size of height and width
104
-
105
  Returns:
106
  (height, width)
107
  Note the height/width order difference >>> pil_img = Image.open("raw_img_tensor.jpg") >>> pil_img.size (640,
108
  480) # (width, height) >>> np_img = np.array(pil_img) >>> np_img.shape (480, 640, 3) # (height, width, 3)
109
- """
110
  height, width = image.shape[-2:]
111
-
112
  if height >= width:
113
  ratio = width * 1.0 / height
114
  new_height = max_size
@@ -120,32 +116,29 @@ def get_resize_size(image, max_size):
120
  size = {"height": int(new_height), "width": int(new_width)}
121
  return size
122
 
123
- file = hf_hub_download(repo_id="Intel/tvp_demo", filename="3MSZA.mp4", repo_type="dataset")
124
-
125
  model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base")
126
 
127
  decoder_kwargs = dict(
128
  container=av.open(file, metadata_errors="ignore"),
129
  sampling_rate=1,
130
- num_frames=model.config.num_frm,
131
  clip_idx=0,
132
  num_clips=1,
133
  target_fps=3,
134
  )
135
- raw_sampled_frms = decode(**decoder_kwargs)
136
- raw_sampled_frms = raw_sampled_frms.permute(0, 3, 1, 2)
137
 
138
- text = "person turn a light on."
139
  processor = AutoProcessor.from_pretrained("Intel/tvp-base")
140
  size = get_resize_size(raw_sampled_frms, model.config.max_img_size)
141
- data = processor(
142
  text=[text], videos=list(raw_sampled_frms.numpy()), return_tensors="pt", max_text_length=100, size=size
143
  )
144
 
145
- data["pixel_values"] = data["pixel_values"].to(model.dtype)
146
- data["labels"] = torch.tensor([30.96, 24.3, 30.4])
147
- output = model(**data)
148
-
149
  print(f"The model's output is {output}")
150
 
151
  def get_video_duration(filename):
 
37
 
38
 
39
  def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
40
+ '''
41
  Convert the video from its original fps to the target_fps and decode the video with PyAV decoder.
42
  Returns:
43
  frames (tensor): decoded frames from the video. Return None if the no
44
  video stream was found.
45
  fps (float): the number of frames per second of the video.
46
+ '''
47
  fps = float(container.streams.video[0].average_rate)
48
  clip_size = sampling_rate * num_frames / target_fps * fps
49
  delta = max(container.streams.video[0].frames - clip_size, 0)
 
65
  frames[frame.pts] = frame
66
  break
67
  frames = [frames[pts] for pts in sorted(frames)]
 
68
  return frames, fps
69
 
70
 
71
  def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
72
+ '''
73
  Decode the video and perform temporal sampling.
74
  Args:
75
  container (container): pyav container.
 
83
  the target video fps before frame sampling.
84
  Returns:
85
  frames (tensor): decoded frames from the video.
86
+ '''
87
  assert clip_idx >= -2, "Not a valied clip_idx {}".format(clip_idx)
88
  frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
89
  clip_size = sampling_rate * num_frames / target_fps * fps
 
92
  frames = [frames[idx] for idx in index]
93
  frames = [frame.to_rgb().to_ndarray() for frame in frames]
94
  frames = torch.from_numpy(np.stack(frames))
 
95
  return frames
96
 
97
  def get_resize_size(image, max_size):
98
+ '''
99
  Args:
100
  image: np.ndarray
101
  max_size: The max size of height and width
 
102
  Returns:
103
  (height, width)
104
  Note the height/width order difference >>> pil_img = Image.open("raw_img_tensor.jpg") >>> pil_img.size (640,
105
  480) # (width, height) >>> np_img = np.array(pil_img) >>> np_img.shape (480, 640, 3) # (height, width, 3)
106
+ '''
107
  height, width = image.shape[-2:]
 
108
  if height >= width:
109
  ratio = width * 1.0 / height
110
  new_height = max_size
 
116
  size = {"height": int(new_height), "width": int(new_width)}
117
  return size
118
 
119
+ file = hf_hub_download(repo_id="Intel/tvp_demo", filename="AK2KG.mp4", repo_type="dataset")
 
120
  model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base")
121
 
122
  decoder_kwargs = dict(
123
  container=av.open(file, metadata_errors="ignore"),
124
  sampling_rate=1,
125
+ num_frames=model.config.num_frames,
126
  clip_idx=0,
127
  num_clips=1,
128
  target_fps=3,
129
  )
130
+ raw_sampled_frms = decode(**decoder_kwargs).permute(0, 3, 1, 2)
 
131
 
132
+ text = "a person is sitting on a bed."
133
  processor = AutoProcessor.from_pretrained("Intel/tvp-base")
134
  size = get_resize_size(raw_sampled_frms, model.config.max_img_size)
135
+ model_inputs = processor(
136
  text=[text], videos=list(raw_sampled_frms.numpy()), return_tensors="pt", max_text_length=100, size=size
137
  )
138
 
139
+ model_inputs["pixel_values"] = model_inputs["pixel_values"].to(model.dtype)
140
+ model_inputs["labels"] = torch.tensor([18.1, 0.0, 6.8])
141
+ output = model(**model_inputs)
 
142
  print(f"The model's output is {output}")
143
 
144
  def get_video_duration(filename):