Spaces:

Form-Fighter
/

FormFighterAIStack

Sleeping

App Files Community

JulianPhillips committed on Oct 10, 2024

Commit

b42f738

verified ·

1 Parent(s): e4107bd

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -21

app.py CHANGED Viewed

@@ -23,14 +23,6 @@ if not os.path.exists(movenet_model_path):
 else:
     movenet_model = tf.saved_model.load(movenet_model_path)
-# Load BLIP model
-blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
-blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
-# Load CLIP model
-clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
-clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
 # Keypoint dictionary for reference
 KEYPOINT_DICT = {
     'nose': 0,
@@ -60,13 +52,13 @@ def process_video():
         # Clear previous cache
         gc.collect()
         torch.cuda.empty_cache()
         # Get the video URL from the request
         video_url = request.json.get('videoURL')
         height =  request.json.get('height')
         weight = request.json.get('weight')
         wingspan = request.json.get('wingspan')
         if not video_url:
             return jsonify({"error": "No video URL provided"}), 400
@@ -99,16 +91,7 @@ def process_video():
         # Process each frame with MoveNet (to get 3D keypoints and detect stance)
         movenet_results = []
         stances = []
-        hip_rotations = []
-        arm_extensions = []
-        stepping_jabs = []
         guard_up = []
-        hand_returned = []
-        hips_width_apart = []
-        leg_angle_correct = []
-        punch_started = False
-        initial_left_wrist = None
-        initial_right_wrist = None
         for frame_index, frame in enumerate(frames):
             input_tensor = tf.image.resize_with_pad(tf.convert_to_tensor(frame, dtype=tf.uint8), 256, 256)
@@ -139,26 +122,46 @@ def process_video():
             right_hand_near_head = abs(right_wrist[1] - nose[1]) < guard_threshold
             guard_up.append(left_hand_near_head and right_hand_near_head)
         # Generate captions for all 60 frames using BLIP
         captions = []
         for frame in frames:
-            inputs = blip_processor(images=frame, return_tensors="pt")
             with torch.no_grad():
                 caption = blip_model.generate(**inputs)
             captions.append(blip_processor.decode(caption[0], skip_special_tokens=True))
         # Use CLIP to assess the similarity of frames to a Muay Thai jab prompt, including stance
         clip_results = []
         for i, frame in enumerate(frames):
             stance = stances[i]
             prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm."
-            text_inputs = clip_processor(text=[prompt], return_tensors="pt")
-            image_inputs = clip_processor(images=frame, return_tensors="pt")
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**image_inputs)
                 text_features = clip_model.get_text_features(**text_inputs)
                 similarity = torch.nn.functional.cosine_similarity(image_features, text_features)
             clip_results.append(similarity.item())
         # Calculate score based on CLIP results and BLIP captions
         avg_clip_similarity = sum(clip_results) / len(clip_results) if clip_results else 0

 else:
     movenet_model = tf.saved_model.load(movenet_model_path)
 # Keypoint dictionary for reference
 KEYPOINT_DICT = {
     'nose': 0,
         # Clear previous cache
         gc.collect()
         torch.cuda.empty_cache()
         # Get the video URL from the request
         video_url = request.json.get('videoURL')
         height =  request.json.get('height')
         weight = request.json.get('weight')
         wingspan = request.json.get('wingspan')
         if not video_url:
             return jsonify({"error": "No video URL provided"}), 400
         # Process each frame with MoveNet (to get 3D keypoints and detect stance)
         movenet_results = []
         stances = []
         guard_up = []
         for frame_index, frame in enumerate(frames):
             input_tensor = tf.image.resize_with_pad(tf.convert_to_tensor(frame, dtype=tf.uint8), 256, 256)
             right_hand_near_head = abs(right_wrist[1] - nose[1]) < guard_threshold
             guard_up.append(left_hand_near_head and right_hand_near_head)
+        # Free up memory used by MoveNet
+        del movenet_model
+        gc.collect()
         # Generate captions for all 60 frames using BLIP
         captions = []
+        blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to('cuda')
+        blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
         for frame in frames:
+            inputs = blip_processor(images=frame, return_tensors="pt").to('cuda')
             with torch.no_grad():
                 caption = blip_model.generate(**inputs)
             captions.append(blip_processor.decode(caption[0], skip_special_tokens=True))
+        # Free up memory used by BLIP
+        del blip_model, blip_processor
+        torch.cuda.empty_cache()
+        gc.collect()
         # Use CLIP to assess the similarity of frames to a Muay Thai jab prompt, including stance
         clip_results = []
+        clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to('cuda')
+        clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
         for i, frame in enumerate(frames):
             stance = stances[i]
             prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm."
+            text_inputs = clip_processor(text=[prompt], return_tensors="pt").to('cuda')
+            image_inputs = clip_processor(images=frame, return_tensors="pt").to('cuda')
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**image_inputs)
                 text_features = clip_model.get_text_features(**text_inputs)
                 similarity = torch.nn.functional.cosine_similarity(image_features, text_features)
             clip_results.append(similarity.item())
+        # Free up memory used by CLIP
+        del clip_model, clip_processor
+        torch.cuda.empty_cache()
+        gc.collect()
         # Calculate score based on CLIP results and BLIP captions
         avg_clip_similarity = sum(clip_results) / len(clip_results) if clip_results else 0