Spaces:

Learningbase
/

RAG

Running

App Files Files Community

Hanzo03 committed 6 days ago

Commit

b28f276

verified ·

1 Parent(s): 08d3193

Update utils/engine.py

Browse files

Files changed (1) hide show

utils/engine.py +8 -6

utils/engine.py CHANGED Viewed

@@ -9,13 +9,13 @@ from typing import Tuple, List
 from utils.config import config, get_logger
 from utils.models import device, clip_processor, clip_model, collection, chroma_client, vlm_model, vlm_tokenizer
 logger = get_logger("Engine")
 def process_and_index_video(video_path: str) -> Tuple[str, List[Image.Image]]:
     if not video_path:
         return "Please upload a video.", []
-    # Strict Cache Cleanup
     if os.path.exists(config.cache_dir):
         logger.info(f"Clearing old cache at {config.cache_dir}...")
         shutil.rmtree(config.cache_dir, ignore_errors=True)
@@ -32,7 +32,6 @@ def process_and_index_video(video_path: str) -> Tuple[str, List[Image.Image]]:
     rgb_first = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
     h, w, c = rgb_first.shape
-    # 🚨 STRICT SSD ALLOCATION
     logger.info(f"Allocating strict Zarr v3 SSD cache at {config.cache_dir}...")
     frame_cache = zarr.create_array(
         config.cache_dir, shape=(0, h, w, c), chunks=(10, h, w, c), dtype='uint8', zarr_format=3
@@ -41,7 +40,6 @@ def process_and_index_video(video_path: str) -> Tuple[str, List[Image.Image]]:
     timestamps, count, frame_idx = [], 0, 0
     while success:
-        # 🚀 SPEED OPTIMIZATION: Only process exact frames needed
         if count % frame_interval == 0:
             rgb_image = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
             frame_cache.append(np.expand_dims(rgb_image, axis=0), axis=0)
@@ -63,7 +61,9 @@ def process_and_index_video(video_path: str) -> Tuple[str, List[Image.Image]]:
         inputs = clip_processor(images=batch_pil, return_tensors="pt").to(device)
         with torch.no_grad():
-            features = clip_model.get_image_features(**inputs)
         normalized = (features / features.norm(p=2, dim=-1, keepdim=True)).cpu().tolist()
         all_embeddings.extend(normalized)
@@ -90,12 +90,14 @@ def ask_video_question(query: str) -> Tuple[str, List[Image.Image]]:
     inputs = clip_processor(text=[query], return_tensors="pt", padding=True).to(device)
     with torch.no_grad():
-        text_features = clip_model.get_text_features(**inputs)
     text_embedding = (text_features / text_features.norm(p=2, dim=-1, keepdim=True)).cpu().tolist()
     results = collection.query(query_embeddings=text_embedding, n_results=3)
-    # Read strictly from SSD
     frame_cache = zarr.open_array(config.cache_dir, mode="r")
     retrieved_images = []

 from utils.config import config, get_logger
 from utils.models import device, clip_processor, clip_model, collection, chroma_client, vlm_model, vlm_tokenizer
 logger = get_logger("Engine")
 def process_and_index_video(video_path: str) -> Tuple[str, List[Image.Image]]:
     if not video_path:
         return "Please upload a video.", []
     if os.path.exists(config.cache_dir):
         logger.info(f"Clearing old cache at {config.cache_dir}...")
         shutil.rmtree(config.cache_dir, ignore_errors=True)
     rgb_first = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
     h, w, c = rgb_first.shape
     logger.info(f"Allocating strict Zarr v3 SSD cache at {config.cache_dir}...")
     frame_cache = zarr.create_array(
         config.cache_dir, shape=(0, h, w, c), chunks=(10, h, w, c), dtype='uint8', zarr_format=3
     timestamps, count, frame_idx = [], 0, 0
     while success:
         if count % frame_interval == 0:
             rgb_image = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
             frame_cache.append(np.expand_dims(rgb_image, axis=0), axis=0)
         inputs = clip_processor(images=batch_pil, return_tensors="pt").to(device)
         with torch.no_grad():
+            # 🚨 BUGFIX: Manually extract and project the vision features
+            vision_outputs = clip_model.vision_model(**inputs)
+            features = clip_model.visual_projection(vision_outputs.pooler_output)
         normalized = (features / features.norm(p=2, dim=-1, keepdim=True)).cpu().tolist()
         all_embeddings.extend(normalized)
     inputs = clip_processor(text=[query], return_tensors="pt", padding=True).to(device)
     with torch.no_grad():
+        # 🚨 BUGFIX: Manually extract and project the text features
+        text_outputs = clip_model.text_model(**inputs)
+        text_features = clip_model.text_projection(text_outputs.pooler_output)
     text_embedding = (text_features / text_features.norm(p=2, dim=-1, keepdim=True)).cpu().tolist()
     results = collection.query(query_embeddings=text_embedding, n_results=3)
     frame_cache = zarr.open_array(config.cache_dir, mode="r")
     retrieved_images = []