arjunanand13 committed
Commit 1ec7ea5
1 Parent(s): 19fd223

Update app.py

Files changed (1): app.py (+22 -2)
app.py CHANGED
@@ -25,6 +25,7 @@ from huggingface_hub import InferenceClient
 
 class VideoClassifier:
     def __init__(self, no_of_frames, mode='interface',model='gemini'):
+
         self.no_of_frames = no_of_frames
         self.mode = mode
         self.model_name = model.strip().lower()
@@ -104,6 +105,7 @@ class VideoClassifier:
         self.llm = HuggingFacePipeline(pipeline=self.generate_text)
 
     def audio_extraction(self,video_input):
+        start_time_audio = time.time()
         print(f"Processing video: {video_input} with {self.no_of_frames} frames.")
         mp4_file = video_input
         video_name = mp4_file.split("/")[-1]
@@ -117,6 +119,9 @@
         result = self.whisper_model.transcribe(audiotrack, fp16=False)
         transcript = result["text"]
         print("TRANSCRIPT",transcript)
+        end_time_audio = time.time()
+        # print("TIME TAKEN FOR AUDIO CONVERSION (WHISPER)",end_time_audio-start_time_audio)
+
         return transcript
 
     def generate_text(self, inputs, parameters=None):
@@ -133,7 +138,7 @@
     def classify_video(self,video_input):
 
         transcript=self.audio_extraction(video_input)
-
+        start_time_caption = time.time()
         video = cv2.VideoCapture(video_input)
         length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
         no_of_frame = int(self.no_of_frames)
@@ -158,7 +163,10 @@
         print("CAPTIONS", captions)
         video.release()
         cv2.destroyAllWindows()
+        end_time_caption = time.time()
+        # print("TIME TAKEN FOR IMAGE CAPTIONING", end_time_caption-start_time_caption)
 
+        start_time_generation = time.time()
         main_categories = Path("main_classes.txt").read_text()
         main_categories_list = ['Automotive', 'Books and Literature', 'Business and Finance', 'Careers', 'Education','Family and Relationships',
                                 'Fine Art', 'Food & Drink', 'Healthy Living', 'Hobbies & Interests', 'Home & Garden','Medical Health', 'Movies', 'Music and Audio',
@@ -303,9 +311,17 @@
         second_video = os.path.join(os.path.dirname(__file__), "PersonalFinance_clip.mp4")
 
         # return final_answer, first_video, second_video
+        end_time_generation = time.time()
+        print("MODEL SETUP TIME",end_time_setup-start_time_setup)
+        print("TIME TAKEN FOR AUDIO CONVERSION (WHISPER)",end_time_audio-start_time_audio)
+        print("TIME TAKEN FOR IMAGE CAPTIONING", end_time_caption-start_time_caption)
+        print("TIME TAKEN FOR CLASS GENERATION",end_time_generation - start_time_generation)
         return final_answer
 
+
     def save_model_choice(self,model_name):
+        start_time_setup = time.time()
+
         self.model_name = model_name
         if self.model_name=='mistral':
             print("Setting up Mistral model for Class Selection")
@@ -313,6 +329,9 @@
         else :
             print("Setting up Gemini model for Class Selection")
             self.setup_gemini_model()
+        end_time_setup = time.time()
+        # print("MODEL SETUP TIME",end_time_setup-start_time_setup)
+
         return "Model selected: " + model_name
 
     def launch_interface(self):
@@ -366,7 +385,8 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     vc = VideoClassifier(no_of_frames=args.no_of_frames, mode=args.mode , model=args.model)
-
+
+
     if args.mode == 'interface':
         vc.launch_interface()
     elif args.mode == 'inference' and args.video_path and args.model:
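Note on the instrumentation above: classify_video prints end_time_setup - start_time_setup and end_time_audio - start_time_audio, but those names are locals of save_model_choice and audio_extraction, so those two prints would raise NameError at runtime. Below is a minimal sketch of one way to share the measurements across methods; the TimedStages class, the timings dict, and the stage labels are illustrative stand-ins, not part of this commit:

    import time

    class TimedStages:
        # Illustrative stand-in for VideoClassifier: durations are kept on
        # the instance, so classify_video can report times recorded earlier
        # by save_model_choice and audio_extraction.
        def __init__(self):
            self.timings = {}  # stage label -> elapsed seconds

        def save_model_choice(self, model_name):
            start = time.time()
            # ... model setup as in the commit ...
            self.timings["MODEL SETUP"] = time.time() - start
            return "Model selected: " + model_name

        def audio_extraction(self, video_input):
            start = time.time()
            transcript = f"transcript of {video_input}"  # stand-in for Whisper
            self.timings["AUDIO CONVERSION (WHISPER)"] = time.time() - start
            return transcript

        def classify_video(self, video_input):
            transcript = self.audio_extraction(video_input)
            start = time.time()
            # ... captioning and class generation as in the commit ...
            self.timings["CLASS GENERATION"] = time.time() - start
            for stage, seconds in self.timings.items():
                print(f"TIME TAKEN FOR {stage}: {seconds:.2f}s")
            return transcript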
 
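The paired start_time_*/end_time_* variables and print calls could also be folded into a single helper; this is a hypothetical refactor (the stage_timer name is an assumption, not in the commit):

    import time
    from contextlib import contextmanager

    @contextmanager
    def stage_timer(label):
        # Prints elapsed wall-clock time for the enclosed block on exit.
        start = time.time()
        try:
            yield
        finally:
            print(f"TIME TAKEN FOR {label}: {time.time() - start:.2f}s")

    # Usage at one of the commit's instrumentation points:
    with stage_timer("AUDIO CONVERSION (WHISPER)"):
        time.sleep(0.1)  # stand-in for whisper_model.transcribe(...)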