CiaraRowles committed
Commit 4a1207e
1 Parent(s): 6b1a06f

TemporalNet2 initial changes

README.md CHANGED
@@ -1,3 +1,39 @@
  ---
  license: openrail
+ tags:
+ - controlnet
+ - stable-diffusion
+ - diffusers
+ base_model: runwayml/stable-diffusion-v1-5
  ---
+ Introducing the Beta Version of TemporalNet
+
+ TemporalNet is a ControlNet model designed to enhance the temporal consistency of generated outputs, as demonstrated in this example: https://twitter.com/CiaraRowles1/status/1637486561917906944. While it does not eliminate all flickering, it significantly reduces it, particularly at higher denoising strengths. For optimal results, it is recommended to use TemporalNet in combination with other methods.
+
+ Instructions for Use:
+
+ 1) Add the model "diff_control_sd15_temporalnet_fp16.safetensors" to the models folder of the ControlNet extension in Automatic1111's Web UI.
+
+ 2) Create a folder that contains:
+
+ - A subfolder named "Input_Images" containing the input frames
+ - A PNG file called "init.png" that is pre-stylized in your desired style
+ - The "temporalvideo.py" script
+
+ 3) Customize the "temporalvideo.py" script according to your preferences, such as the image resolution, prompt, and ControlNet settings (see the sketch just after this diff for the fields most users edit).
+
+ 4) Launch Automatic1111's Web UI with the --api flag enabled.
+
+ 5) Execute the Python script.
+
+ *Please note that the "init.png" image will not significantly influence the style of the output video. Its primary purpose is to prevent a drastic change in aesthetics during the first few frames.*
+
+ Also, I highly recommend using this in conjunction with the HED model; the settings are already in the script.
+
+ ToDo:
+
+ Write an extension for the Web UI.
+
+ Write a feature that automatically generates an "init.png" image if none is provided.
+
+ ~~Change the extension to .safetensors and investigate compression.~~
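For step 3 above, the settings mentioned correspond to fields of the request payload that temporalvideo.py (added later in this commit) sends to the Web UI API. A minimal sketch of the values most users will want to edit; the field names come from that script, the comments are editorial:

```python
# Illustrative only: these are the payload fields in temporalvideo.py that
# step 3 refers to. Edit them directly inside the script's `data` dictionary.
payload_edits = {
    "prompt": "1girl, woman",      # describe the style/subject you want
    "negative_prompt": "",
    "denoising_strength": 0.4,     # how strongly each frame is restyled
    "width": 512,                  # output resolution
    "height": 512,
    "seed": 4123457655,            # a fixed seed keeps frames more consistent
}
```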
cldm_v15.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 3
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
config.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "_class_name": "ControlNetModel",
+   "_diffusers_version": "0.15.0.dev0",
+   "_name_or_path": "./",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "class_embed_type": null,
+   "conditioning_embedding_out_channels": [
+     16,
+     32,
+     96,
+     256
+   ],
+   "controlnet_conditioning_channel_order": "rgb",
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "projection_class_embeddings_input_dim": null,
+   "resnet_time_scale_shift": "default",
+   "upcast_attention": false,
+   "use_linear_projection": false
+ }
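config.json describes the model as a diffusers ControlNetModel (diffusers >= 0.15), so the checkpoint should also be loadable from Python once diffusers-format weights are present in the repo. A minimal sketch, assuming the repo id CiaraRowles/TemporalNet2 and that converted weights sit next to this config (neither is confirmed by this commit):

```python
# Hedged sketch: load the ControlNet described by config.json with diffusers.
# The repo id and the presence of diffusers-format weights are assumptions.
import torch
from diffusers import ControlNetModel

controlnet = ControlNetModel.from_pretrained(
    "CiaraRowles/TemporalNet2",   # hypothetical repo id
    torch_dtype=torch.float16,
)
print(controlnet.config.cross_attention_dim)  # 768, as declared in config.json
```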
crop_all_images.py ADDED
@@ -0,0 +1,33 @@
+ import cv2
+ import os
+
+ y_folder = "./Input_Images"
+
+ def crop_and_save_images(folder_path):
+     # Get a list of all files in the folder
+     files = os.listdir(folder_path)
+
+     for file in files:
+         # Construct the full file path
+         file_path = os.path.join(folder_path, file)
+
+         # Load the image
+         img = cv2.imread(file_path)
+
+         # Skip anything OpenCV could not read (e.g. non-image files)
+         if img is None:
+             continue
+
+         # Get the dimensions of the image
+         h, w = img.shape[:2]
+
+         # Determine the size of the square, centered crop
+         crop_size = min(h, w)
+
+         # Calculate the start coordinates of the crop
+         start_y = (h - crop_size) // 2
+         start_x = (w - crop_size) // 2
+
+         # Perform the crop
+         img_cropped = img[start_y : start_y + crop_size, start_x : start_x + crop_size]
+
+         # Save the cropped image, overwriting the original image
+         cv2.imwrite(file_path, img_cropped)
+
+ # Example usage:
+ crop_and_save_images(y_folder)
temporalnetversion2.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51eea78ec529e554291e5c6f66f5a37c27ae8565594b22da297e28eb0f47ab27
+ size 12688113093
temporalnetversion2.yaml ADDED
@@ -0,0 +1,79 @@
+ model:
+   target: cldm.cldm.ControlLDM
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     control_key: "hint"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+     only_mid_control: False
+
+     control_stage_config:
+       target: cldm.cldm.ControlNet
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         hint_channels: 6
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     unet_config:
+       target: cldm.cldm.ControlledUnetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
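The only functional difference from cldm_v15.yaml above is hint_channels: 6; TemporalNet2's ControlNet takes a six-channel hint instead of the usual three. temporalvideo.py (below) builds that hint by stacking the previous stylized frame with an RGB visualization of the optical flow. A standalone sketch of the same idea, with illustrative array names:

```python
# Sketch of the six-channel hint implied by hint_channels: 6, mirroring the
# np.dstack call in temporalvideo.py (array names here are made up).
import numpy as np

prev_frame = np.zeros((512, 512, 3), dtype=np.uint8)  # previous stylized frame, RGB
flow_rgb   = np.zeros((512, 512, 3), dtype=np.uint8)  # optical flow rendered as RGB

hint = np.dstack((prev_frame, flow_rgb))  # shape (512, 512, 6)
assert hint.shape == (512, 512, 6)        # matches hint_channels: 6
```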
temporalvideo.py ADDED
@@ -0,0 +1,213 @@
+ import os
+ import glob
+ import requests
+ import json
+ import cv2
+ import numpy as np
+ import sys
+ import torch
+ from PIL import Image
+ from pprint import pprint
+ import base64
+ from io import BytesIO
+ import torchvision.transforms.functional as F
+ from torchvision.io import read_image, write_jpeg, ImageReadMode
+ from torchvision.models.optical_flow import Raft_Large_Weights, raft_large
+ from torchvision.utils import flow_to_image
+ import pickle
+
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # RAFT optical-flow model used to estimate motion between consecutive input frames
+ model = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).to(device)
+ model = model.eval()
+
+ # Replace with the actual path to your image file and folder
+ x_path = "./init.png"
+ y_folder = "./Input_Images"
+
+ output_folder = "output"
+ os.makedirs(output_folder, exist_ok=True)
+
+
+ def get_image_paths(folder):
+     image_extensions = ("*.jpg", "*.jpeg", "*.png", "*.bmp")
+     files = []
+     for ext in image_extensions:
+         files.extend(glob.glob(os.path.join(folder, ext)))
+     return sorted(files)
+
+
+ y_paths = get_image_paths(y_folder)
+
+
+ def send_request(last_image_path, optical_flow_path, current_image_path):
+     url = "http://localhost:7860/sdapi/v1/img2img"
+
+     with open(last_image_path, "rb") as b:
+         last_image_encoded = base64.b64encode(b.read()).decode("utf-8")
+
+     # Load and process the last (previously generated) image
+     last_image = cv2.imread(last_image_path)
+     last_image = cv2.cvtColor(last_image, cv2.COLOR_BGR2RGB)
+     last_image = cv2.resize(last_image, (512, 512))
+
+     # Load and process the optical flow image
+     flow_image = cv2.imread(optical_flow_path)
+     flow_image = cv2.cvtColor(flow_image, cv2.COLOR_BGR2RGB)
+
+     # Load and encode the current input frame
+     with open(current_image_path, "rb") as b:
+         current_image = base64.b64encode(b.read()).decode("utf-8")
+
+     # Stack the previous frame and the flow image into a 6-channel hint image
+     six_channel_image = np.dstack((last_image, flow_image))
+
+     # Serialize and encode the 6-channel image for the API payload
+     serialized_image = pickle.dumps(six_channel_image)
+     encoded_image = base64.b64encode(serialized_image).decode("utf-8")
+
+     data = {
+         "init_images": [current_image],
+         "inpainting_fill": 0,
+         "inpaint_full_res": True,
+         "inpaint_full_res_padding": 1,
+         "inpainting_mask_invert": 1,
+         "resize_mode": 0,
+         "denoising_strength": 0.4,
+         "prompt": "1girl, woman",
+         "negative_prompt": "",
+         "alwayson_scripts": {
+             "ControlNet": {
+                 "args": [
+                     {
+                         "input_image": current_image,
+                         "module": "hed",
+                         "model": "control_hed-fp16 [13fee50b]",
+                         "weight": 0.7,
+                         "guidance": 1,
+                     },
+                     {
+                         "input_image": encoded_image,
+                         "model": "temporalnetversion2 [b146ac48]",
+                         "module": "none",
+                         "weight": 0.6,
+                         "guidance": 1,
+                     },
+                     {
+                         "input_image": current_image,
+                         "model": "control_v11p_sd15_openpose [cab727d4]",
+                         "module": "openpose_full",
+                         "weight": 0.7,
+                         "guidance": 1,
+                     },
+                 ]
+             }
+         },
+         "seed": 4123457655,
+         "subseed": -1,
+         "subseed_strength": -1,
+         "sampler_index": "Euler a",
+         "batch_size": 1,
+         "n_iter": 1,
+         "steps": 20,
+         "cfg_scale": 6,
+         "width": 512,
+         "height": 512,
+         "restore_faces": True,
+         "include_init_images": True,
+         "override_settings": {},
+         "override_settings_restore_afterwards": True,
+     }
+     response = requests.post(url, json=data)
+     if response.status_code == 200:
+         return response.content
+     else:
+         try:
+             error_data = response.json()
+             print("Error:")
+             print(str(error_data))
+         except json.JSONDecodeError:
+             print("Error: Unable to parse JSON error data.")
+         return None
+
+
+ def infer(frameA, frameB):
+     # Estimate optical flow between two consecutive input frames with RAFT
+     input_frame_1 = read_image(str(frameA), ImageReadMode.RGB)
+     input_frame_2 = read_image(str(frameB), ImageReadMode.RGB)
+
+     img1_batch = torch.stack([input_frame_1])
+     img2_batch = torch.stack([input_frame_2])
+
+     weights = Raft_Large_Weights.DEFAULT
+     transforms = weights.transforms()
+
+     def preprocess(img1_batch, img2_batch):
+         img1_batch = F.resize(img1_batch, size=[512, 512])
+         img2_batch = F.resize(img2_batch, size=[512, 512])
+         return transforms(img1_batch, img2_batch)
+
+     img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
+
+     list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
+
+     # Take the final (most refined) flow estimate and render it as an RGB image
+     predicted_flow = list_of_flows[-1][0]
+     optical_flow_path = os.path.join(output_folder, f"flow_{i}.png")  # uses the global loop index i
+     flow_img = flow_to_image(predicted_flow).to("cpu")
+     write_jpeg(flow_img, optical_flow_path)
+
+     return optical_flow_path
+
+
+ output_images = []
+ output_paths = []
+
+ # Initialize with the first image path
+ result = x_path
+ output_image_path = os.path.join(output_folder, "output_image_0.png")
+
+ # with open(output_image_path, "wb") as f:
+ #     f.write(result)
+
+ last_image_path = x_path
+ for i in range(1, len(y_paths)):
+     # Use the last output image and the optical flow map to condition the next frame
+     optical_flow = infer(y_paths[i - 1], y_paths[i])
+
+     result = send_request(last_image_path, optical_flow, y_paths[i])
+     data = json.loads(result)
+     encoded_image = data["images"][0]
+     output_image_path = os.path.join(output_folder, f"output_image_{i}.png")
+     last_image_path = output_image_path
+     with open(output_image_path, "wb") as f:
+         f.write(base64.b64decode(encoded_image))
+     print(f"Written data for frame {i}")