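"""Extract frames from downloaded videos, embed them with CLIP, and write a parquet dataset."""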
import base64

import cv2
import pandas as pd
from PIL import Image
from tqdm import tqdm

from pipeline.clip_wrapper import MODEL_DIM, ClipWrapper
from pipeline.download_videos import DATA_DIR, REPO_ROOT, VIDEO_DIR

FRAME_EXTRACT_RATE_SECONDS = 5  # Extract a frame every 5 seconds
IMAGES_DIR = DATA_DIR / "images"
DATAFRAME_PATH = DATA_DIR / "dataset.parquet"


def process_videos() -> None:
    """Run CLIP on video frames and save the results to a parquet file."""
    IMAGES_DIR.mkdir(exist_ok=True, parents=True)
    # Keep extracted frames out of version control
    (IMAGES_DIR / ".gitignore").write_text("*")
    clip_wrapper = ClipWrapper()
    results = []
    for video_path in tqdm(list(VIDEO_DIR.glob("*.mp4")), desc="Processing videos"):
        video_id = video_path.stem
        extracted_images_dir = IMAGES_DIR / video_id
        extracted_images_dir.mkdir(exist_ok=True, parents=True)
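        # A "complete" marker file means this video was already processed in a previous run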
        complete_file = extracted_images_dir / "complete"
        if complete_file.exists():
            continue
        for clip_vector, image, timestamp_secs, frame_idx in get_clip_vectors(
            video_path, clip_wrapper
        ):
            image_path = extracted_images_dir / f"{frame_idx}.jpg"
            image.save(image_path)
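            # Base64-encode the saved JPEG so the parquet file is self-contained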
            with open(image_path, "rb") as f:
                encoded_image = base64.b64encode(f.read()).decode()
            results.append(
                [
                    video_id,
                    frame_idx,
                    timestamp_secs,
                    encoded_image,
                    *clip_vector,
                ]
            )
        complete_file.touch()
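    # One row per extracted frame: metadata, the base64 image, and the CLIP embedding dims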
    df = pd.DataFrame(
        results,
        columns=["video_id", "frame_idx", "timestamp", "base64_image"]
        + [f"dim_{i}" for i in range(MODEL_DIM)],
    )
    print(f"Saving data to {DATAFRAME_PATH}")
    df.to_parquet(DATAFRAME_PATH, index=False)


def get_clip_vectors(video_path, clip_wrapper):
    """Yield (clip_vector, image, timestamp_secs, frame_idx) for sampled video frames."""
    cap = cv2.VideoCapture(str(video_path))
    num_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    # Sample one frame every FRAME_EXTRACT_RATE_SECONDS; guard against fps == 0 on malformed files
    extract_every_n_frames = max(1, FRAME_EXTRACT_RATE_SECONDS * fps)
    for frame_idx in tqdm(range(num_video_frames), desc="Running CLIP on video"):
        ret, frame = cap.read()
        if not ret:  # Stop early if the stream ends or a frame fails to decode
            break
        if frame_idx % extract_every_n_frames != 0:
            continue
        # OpenCV decodes frames as BGR; reverse the channel order for PIL (RGB)
        image = Image.fromarray(frame[..., ::-1])
        clip_vector = clip_wrapper.images2vec([image]).squeeze().numpy()
        timestamp_secs = frame_idx / fps
        yield clip_vector, image, timestamp_secs, frame_idx
    cap.release()


if __name__ == "__main__":
    process_videos()