JulianPhillips committed
Commit
5e7e8fb
1 Parent(s): b42f738

Update app.py

Files changed (1)
  1. app.py +20 -12
app.py CHANGED
@@ -9,6 +9,11 @@ import requests
 from tempfile import NamedTemporaryFile
 import gc
 import tensorflow_hub as hub
+import logging
+from PIL import Image
+
+# Configure logging
+logging.basicConfig(level=logging.ERROR)

 # Ensure that Hugging Face uses the appropriate cache directory
 os.environ['TRANSFORMERS_CACHE'] = '/app/cache'
@@ -16,13 +21,6 @@ os.environ['HF_HOME'] = '/app/cache'

 movenet_model_path = '/models/movenet/movenet_lightning'

-# Check if the model path exists
-if not os.path.exists(movenet_model_path):
-    # Download the model from TensorFlow Hub
-    movenet_model = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
-else:
-    movenet_model = tf.saved_model.load(movenet_model_path)
-
 # Keypoint dictionary for reference
 KEYPOINT_DICT = {
     'nose': 0,
@@ -88,6 +86,13 @@ def process_video():
         cap.release()
         os.remove(video_path)

+        # Check if the model path exists and load MoveNet model
+        if not os.path.exists(movenet_model_path):
+            # Download the model from TensorFlow Hub
+            movenet_model = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
+        else:
+            movenet_model = tf.saved_model.load(movenet_model_path)
+
         # Process each frame with MoveNet (to get 3D keypoints and detect stance)
         movenet_results = []
         stances = []
@@ -128,11 +133,12 @@ def process_video():

         # Generate captions for all 60 frames using BLIP
         captions = []
-        blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to('cuda')
+        blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to('cuda' if torch.cuda.is_available() else 'cpu')
         blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')

         for frame in frames:
-            inputs = blip_processor(images=frame, return_tensors="pt").to('cuda')
+            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert frame to PIL image
+            inputs = blip_processor(images=frame_pil, return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
             with torch.no_grad():
                 caption = blip_model.generate(**inputs)
             captions.append(blip_processor.decode(caption[0], skip_special_tokens=True))
@@ -144,14 +150,15 @@ def process_video():

         # Use CLIP to assess the similarity of frames to a Muay Thai jab prompt, including stance
         clip_results = []
-        clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to('cuda')
+        clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to('cuda' if torch.cuda.is_available() else 'cpu')
         clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

         for i, frame in enumerate(frames):
+            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert frame to PIL image
             stance = stances[i]
             prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm."
-            text_inputs = clip_processor(text=[prompt], return_tensors="pt").to('cuda')
-            image_inputs = clip_processor(images=frame, return_tensors="pt").to('cuda')
+            text_inputs = clip_processor(text=[prompt], return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
+            image_inputs = clip_processor(images=frame_pil, return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**image_inputs)
                 text_features = clip_model.get_text_features(**text_inputs)
@@ -182,6 +189,7 @@ def process_video():
         }
         return jsonify(response)
     except Exception as e:
+        logging.error(str(e))
         return jsonify({"error": str(e)}), 500

 if __name__ == '__main__':
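
For reference, a minimal standalone sketch of the device-fallback and BGR-to-RGB conversion pattern this change applies to the BLIP captioning path, assuming the same 'Salesforce/blip-image-captioning-base' checkpoint; caption_frame is an illustrative helper name and is not part of app.py.

import cv2
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Fall back to CPU when no GPU is available, as in the updated app.py.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to(device)

def caption_frame(frame_bgr):
    # OpenCV frames are BGR numpy arrays; the Hugging Face processor expects RGB (PIL) images.
    frame_pil = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    inputs = blip_processor(images=frame_pil, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = blip_model.generate(**inputs)
    return blip_processor.decode(output_ids[0], skip_special_tokens=True)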