File size: 2,284 Bytes
d323598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""Command line interface for generating videos from the model."""
from __future__ import annotations

import argparse
import queue
import threading

import rerun as rr

import vista


def generate_local(
    first_frame_file_name: str,
    height: int = 576,
    width: int = 1024,
    n_rounds: int = 4,
    n_frames: int = 25,
    n_steps: int = 10,
    cfg_scale: float = 2.5,
    cond_aug: float = 0.0,
) -> None:
    """Run ``vista.run_sampling`` on a worker thread and log its output to Rerun.

    The worker receives a queue plus all arguments of this function, unchanged
    and in the same order. This function drains the queue on the calling
    thread until the ``"done"`` sentinel arrives. Every other queue item must
    be a ``(entity_path, entity, times)`` triple, where ``times`` is an
    iterable of ``(timeline, time)`` pairs: an ``int`` time is set as a
    sequence index, anything else is set as seconds.

    Args:
        first_frame_file_name: Path of the conditioning image; passed straight
            to ``vista.run_sampling``.
        height: Forwarded to ``vista.run_sampling`` (presumably the output
            height in pixels -- confirm against that function).
        width: Forwarded to ``vista.run_sampling`` (presumably the output
            width in pixels -- confirm against that function).
        n_rounds: Forwarded to ``vista.run_sampling``.
        n_frames: Forwarded to ``vista.run_sampling``.
        n_steps: Forwarded to ``vista.run_sampling``.
        cfg_scale: Forwarded to ``vista.run_sampling``.
        cond_aug: Forwarded to ``vista.run_sampling``.

    Raises:
        Exception: Whatever ``vista.run_sampling`` raised on the worker
            thread, re-raised here once the worker has finished.
    """
    # Use a queue to log immediately from internals
    log_queue = queue.SimpleQueue()
    failures: list[Exception] = []

    def _sample() -> None:
        """Run the sampler and always unblock the consumer loop below."""
        try:
            vista.run_sampling(
                log_queue,
                first_frame_file_name,
                height,
                width,
                n_rounds,
                n_frames,
                n_steps,
                cfg_scale,
                cond_aug,
            )
        except Exception as exc:
            # Hand the error to the calling thread instead of losing it in
            # the worker; it is re-raised after join().
            failures.append(exc)
        finally:
            # Guarantee the sentinel even when sampling fails; otherwise the
            # consumer would block forever on log_queue.get(). If the sampler
            # already posted its own "done", this extra one is never read.
            log_queue.put("done")

    worker = threading.Thread(target=_sample)
    worker.start()

    while True:
        msg = log_queue.get()
        if msg == "done":
            break

        entity_path, entity, times = msg
        # Clear timeline state left over from the previous message before
        # applying this message's own timestamps.
        rr.reset_time()
        for timeline, time in times:
            if isinstance(time, int):
                rr.set_time_sequence(timeline, time)
            else:
                rr.set_time_seconds(timeline, time)
        rr.log(entity_path, entity)

    worker.join()

    if failures:
        raise failures[0]


# Script entry point: parse the CLI flags, set up Rerun, then generate and log
# the video.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate video conditioned on a single image using the Vista model."
    )
    # This path is handed to generate_local() as `first_frame_file_name`.
    parser.add_argument(
        "--img-path",
        type=str,
        help="Path to the image used as the first frame of the generated video.",
        default="./example_images/nus-0.jpg",
    )
    parser.add_argument(
        "--num-steps",
        type=int,
        help="Number of diffusion steps per image. Recommended range: 10-50. Higher values result in more detailed images and less blurry results.",
        default=20,
    )
    parser.add_argument(
        "--num-segments",
        type=int,
        help="Number of segments to generate. Each segment consists of 25 frames.",
        default=3,
    )
    # Let Rerun register its own command line flags on the same parser so
    # they are parsed together with the ones above.
    rr.script_add_args(parser)
    args = parser.parse_args()
    # The blueprint is built from the segment count, so it has to be created
    # after argument parsing.
    rr.script_setup(
        args,
        "rerun_example_vista",
        default_blueprint=vista.generate_blueprint(args.num_segments),
    )

    # --num-steps maps to n_steps and --num-segments to n_rounds; all other
    # sampling parameters keep generate_local()'s defaults.
    generate_local(args.img_path, n_steps=args.num_steps, n_rounds=args.num_segments)