JulianPhillips committed
Commit
5e7e8fb
1 Parent(s): b42f738

Update app.py

Files changed (1)
  1. app.py +20 -12
app.py CHANGED
@@ -9,6 +9,11 @@ import requests
 from tempfile import NamedTemporaryFile
 import gc
 import tensorflow_hub as hub
+import logging
+from PIL import Image
+
+# Configure logging
+logging.basicConfig(level=logging.ERROR)

 # Ensure that Hugging Face uses the appropriate cache directory
 os.environ['TRANSFORMERS_CACHE'] = '/app/cache'
@@ -16,13 +21,6 @@ os.environ['HF_HOME'] = '/app/cache'

 movenet_model_path = '/models/movenet/movenet_lightning'

-# Check if the model path exists
-if not os.path.exists(movenet_model_path):
-    # Download the model from TensorFlow Hub
-    movenet_model = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
-else:
-    movenet_model = tf.saved_model.load(movenet_model_path)
-
 # Keypoint dictionary for reference
 KEYPOINT_DICT = {
     'nose': 0,
@@ -88,6 +86,13 @@ def process_video():
         cap.release()
         os.remove(video_path)

+        # Check if the model path exists and load MoveNet model
+        if not os.path.exists(movenet_model_path):
+            # Download the model from TensorFlow Hub
+            movenet_model = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
+        else:
+            movenet_model = tf.saved_model.load(movenet_model_path)
+
         # Process each frame with MoveNet (to get 3D keypoints and detect stance)
         movenet_results = []
         stances = []
@@ -128,11 +133,12 @@ def process_video():

         # Generate captions for all 60 frames using BLIP
         captions = []
-        blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to('cuda')
+        blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to('cuda' if torch.cuda.is_available() else 'cpu')
         blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')

         for frame in frames:
-            inputs = blip_processor(images=frame, return_tensors="pt").to('cuda')
+            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert frame to PIL image
+            inputs = blip_processor(images=frame_pil, return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
             with torch.no_grad():
                 caption = blip_model.generate(**inputs)
             captions.append(blip_processor.decode(caption[0], skip_special_tokens=True))
@@ -144,14 +150,15 @@ def process_video():

         # Use CLIP to assess the similarity of frames to a Muay Thai jab prompt, including stance
         clip_results = []
-        clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to('cuda')
+        clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to('cuda' if torch.cuda.is_available() else 'cpu')
         clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

         for i, frame in enumerate(frames):
+            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert frame to PIL image
             stance = stances[i]
             prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm."
-            text_inputs = clip_processor(text=[prompt], return_tensors="pt").to('cuda')
-            image_inputs = clip_processor(images=frame, return_tensors="pt").to('cuda')
+            text_inputs = clip_processor(text=[prompt], return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
+            image_inputs = clip_processor(images=frame_pil, return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**image_inputs)
                 text_features = clip_model.get_text_features(**text_inputs)
@@ -182,6 +189,7 @@ def process_video():
         }
         return jsonify(response)
     except Exception as e:
+        logging.error(str(e))
         return jsonify({"error": str(e)}), 500

 if __name__ == '__main__':
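
For reference, a minimal standalone sketch of the device-fallback and BGR-to-RGB conversion pattern this change applies to the BLIP captioning path, assuming the same 'Salesforce/blip-image-captioning-base' checkpoint; caption_frame is an illustrative helper name and is not part of app.py.

import cv2
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Fall back to CPU when no GPU is available, as in the updated app.py.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to(device)

def caption_frame(frame_bgr):
    # OpenCV frames are BGR numpy arrays; the Hugging Face processor expects RGB (PIL) images.
    frame_pil = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    inputs = blip_processor(images=frame_pil, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = blip_model.generate(**inputs)
    return blip_processor.decode(output_ids[0], skip_special_tokens=True)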