ojaffe committed (verified)
Commit 7d8154b · 1 Parent(s): ae8a361

Upload folder using huggingface_hub
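For reference, a commit like this is typically produced with the upload_folder helper from the huggingface_hub library. A minimal sketch, assuming a placeholder repo id and local path (neither is taken from this commit):

# Hypothetical reproduction of this kind of upload; repo_id and
# folder_path are placeholders, not values from this commit.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token saved by `huggingface-cli login`
api.upload_folder(
    folder_path=".",                # local folder to push
    repo_id="ojaffe/example-repo",  # placeholder target repository
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)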

Files changed (2)
  1. __pycache__/predict.cpython-311.pyc +0 -0
  2. predict.py +44 -38
__pycache__/predict.cpython-311.pyc CHANGED
Binary files a/__pycache__/predict.cpython-311.pyc and b/__pycache__/predict.cpython-311.pyc differ
 
predict.py CHANGED
@@ -171,16 +171,10 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
             predicted[:, step] = ar_weight * ar_pred[:, step] + direct_weight * direct_pred[:, step]
 
         predicted_np = predicted[0].cpu().numpy()
-        last_ctx_uint8 = (last_frame * 255).clip(0, 255).astype(np.uint8)  # [64,64,3]
         ens.direct_cache = []
         for i in range(PRED_FRAMES):
             frame = np.transpose(predicted_np[i], (1, 2, 0))
             frame = (frame * 255).clip(0, 255).astype(np.uint8)
-            # Fallback: if prediction is very different from context, blend with context
-            diff = np.abs(frame.astype(np.float32) - last_ctx_uint8.astype(np.float32))
-            mean_diff = diff.mean()
-            if mean_diff > 30:  # very different prediction
-                frame = ((0.5 * frame.astype(np.float32) + 0.5 * last_ctx_uint8.astype(np.float32))).clip(0, 255).astype(np.uint8)
             ens.direct_cache.append(frame)
 
         result = ens.direct_cache[ens.cache_step]
@@ -196,6 +190,14 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
         ens.reset_cache()
         return result
 
+    # Detect scene transitions in context frames
+    scene_transition = False
+    for i in range(len(frames) - 1):
+        diff = np.abs(frames[i].astype(np.float32) - frames[i + 1].astype(np.float32)).mean()
+        if diff > 30.0 / 255.0:  # frames are normalized to 0-1
+            scene_transition = True
+            break
+
     ens.reset_cache()
     with torch.no_grad():
         context_tensor = torch.from_numpy(context).to(DEVICE)
@@ -208,39 +210,43 @@ def predict_next_frame(ens, context_frames: np.ndarray) -> np.ndarray:
         direct_flipped = torch.flip(direct_flipped, dims=[4])
         direct_pred = (direct_orig + direct_flipped) / 2.0
 
-        # Multi-run AR with noise diversity
-        all_ar_runs = []
-        for noise_std in [0.0, 1.0/255.0, 2.0/255.0]:
-            ar_preds_run = []
-            ctx = context_tensor.clone()
-            ctx_flip = context_flipped.clone()
-            last_t = last_tensor.clone()
-            last_f = last_flipped.clone()
+        if scene_transition:
+            # Scene transition: use direct-only (AR produces garbage after scene changes)
+            predicted = direct_pred
+        else:
+            # Normal scene: full AR+direct blend with noise diversity
+            all_ar_runs = []
+            for noise_std in [0.0, 1.0/255.0, 2.0/255.0]:
+                ar_preds_run = []
+                ctx = context_tensor.clone()
+                ctx_flip = context_flipped.clone()
+                last_t = last_tensor.clone()
+                last_f = last_flipped.clone()
+                for step in range(PRED_FRAMES):
+                    ctx_in = ctx if noise_std == 0 else torch.clamp(ctx + torch.randn_like(ctx) * noise_std, 0, 1)
+                    ctx_flip_in = ctx_flip if noise_std == 0 else torch.clamp(ctx_flip + torch.randn_like(ctx_flip) * noise_std, 0, 1)
+                    ar_orig = _predict_ar_frame(ens.sonic_ar, ctx_in, last_t)
+                    ar_flip = _predict_ar_frame(ens.sonic_ar, ctx_flip_in, last_f)
+                    ar_flip_back = torch.flip(ar_flip, dims=[3])
+                    ar_frame = (ar_orig + ar_flip_back) / 2.0
+                    ar_preds_run.append(ar_frame)
+                    ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
+                    ctx_frames = torch.cat([ctx_frames[:, 1:], ar_orig.unsqueeze(1)], dim=1)
+                    ctx = ctx_frames.reshape(1, -1, 64, 64)
+                    last_t = ar_orig
+                    ctx_flip_frames = ctx_flip.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
+                    ctx_flip_frames = torch.cat([ctx_flip_frames[:, 1:], ar_flip.unsqueeze(1)], dim=1)
+                    ctx_flip = ctx_flip_frames.reshape(1, -1, 64, 64)
+                    last_f = ar_flip
+                all_ar_runs.append(torch.stack(ar_preds_run, dim=1))
+
+            ar_pred = sum(all_ar_runs) / len(all_ar_runs)
+
+            predicted = torch.zeros_like(direct_pred)
             for step in range(PRED_FRAMES):
-                ctx_in = ctx if noise_std == 0 else torch.clamp(ctx + torch.randn_like(ctx) * noise_std, 0, 1)
-                ctx_flip_in = ctx_flip if noise_std == 0 else torch.clamp(ctx_flip + torch.randn_like(ctx_flip) * noise_std, 0, 1)
-                ar_orig = _predict_ar_frame(ens.sonic_ar, ctx_in, last_t)
-                ar_flip = _predict_ar_frame(ens.sonic_ar, ctx_flip_in, last_f)
-                ar_flip_back = torch.flip(ar_flip, dims=[3])
-                ar_frame = (ar_orig + ar_flip_back) / 2.0
-                ar_preds_run.append(ar_frame)
-                ctx_frames = ctx.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
-                ctx_frames = torch.cat([ctx_frames[:, 1:], ar_orig.unsqueeze(1)], dim=1)
-                ctx = ctx_frames.reshape(1, -1, 64, 64)
-                last_t = ar_orig
-                ctx_flip_frames = ctx_flip.reshape(1, CONTEXT_FRAMES, 3, 64, 64)
-                ctx_flip_frames = torch.cat([ctx_flip_frames[:, 1:], ar_flip.unsqueeze(1)], dim=1)
-                ctx_flip = ctx_flip_frames.reshape(1, -1, 64, 64)
-                last_f = ar_flip
-            all_ar_runs.append(torch.stack(ar_preds_run, dim=1))
-
-        ar_pred = sum(all_ar_runs) / len(all_ar_runs)
-
-        predicted = torch.zeros_like(direct_pred)
-        for step in range(PRED_FRAMES):
-            ar_weight = 0.65 - (step / (PRED_FRAMES - 1)) * 0.3
-            direct_weight = 1.0 - ar_weight
-            predicted[:, step] = ar_weight * ar_pred[:, step] + direct_weight * direct_pred[:, step]
+                ar_weight = 0.65 - (step / (PRED_FRAMES - 1)) * 0.3
+                direct_weight = 1.0 - ar_weight
+                predicted[:, step] = ar_weight * ar_pred[:, step] + direct_weight * direct_pred[:, step]
 
         predicted_np = predicted[0].cpu().numpy()
         ens.direct_cache = []
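Note on the change above: the new gate is a plain mean-absolute-difference scene-cut check over adjacent context frames, with direct_pred used on its own whenever it fires. A self-contained sketch of the same check, assuming [0, 1]-normalized float frames (the function name, shapes, and demo frames are illustrative; only the 30/255 threshold comes from the diff):

import numpy as np

def has_scene_transition(frames: np.ndarray, threshold: float = 30.0 / 255.0) -> bool:
    # Flag a cut if any adjacent pair of frames differs by more than
    # `threshold` on average; frames are floats normalized to [0, 1].
    for a, b in zip(frames[:-1], frames[1:]):
        if np.abs(a.astype(np.float32) - b.astype(np.float32)).mean() > threshold:
            return True
    return False

# Illustrative usage: a hard black-to-white cut trips the check.
ctx = np.zeros((4, 64, 64, 3), dtype=np.float32)
ctx[2:] = 1.0
assert has_scene_transition(ctx)          # mean diff 1.0 > 30/255
assert not has_scene_transition(ctx[:2])  # identical frames, no cut

Skipping the autoregressive branch after a detected cut avoids conditioning it on a context that straddles two scenes, which is the failure mode the in-diff comment calls out.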