arjunanand13 committed • Commit 1ec7ea5 • Parent(s): 19fd223
Update app.py
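The change adds wall-clock timing instrumentation to app.py: start/end timestamps are recorded around the Whisper audio extraction, the frame-captioning loop, the class-generation step, and model setup in save_model_choice, and the elapsed times are printed at the end of classify_video.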
app.py CHANGED
@@ -25,6 +25,7 @@ from huggingface_hub import InferenceClient
 
 class VideoClassifier:
     def __init__(self, no_of_frames, mode='interface',model='gemini'):
+
         self.no_of_frames = no_of_frames
         self.mode = mode
         self.model_name = model.strip().lower()
@@ -104,6 +105,7 @@ class VideoClassifier:
         self.llm = HuggingFacePipeline(pipeline=self.generate_text)
 
     def audio_extraction(self,video_input):
+        start_time_audio = time.time()
         print(f"Processing video: {video_input} with {self.no_of_frames} frames.")
         mp4_file = video_input
         video_name = mp4_file.split("/")[-1]
@@ -117,6 +119,9 @@ class VideoClassifier:
         result = self.whisper_model.transcribe(audiotrack, fp16=False)
         transcript = result["text"]
         print("TRANSCRIPT",transcript)
+        end_time_audio = time.time()
+        # print("TIME TAKEN FOR AUDIO CONVERSION (WHISPER)",end_time_audio-start_time_audio)
+
         return transcript
 
     def generate_text(self, inputs, parameters=None):
@@ -133,7 +138,7 @@ class VideoClassifier:
     def classify_video(self,video_input):
 
         transcript=self.audio_extraction(video_input)
-
+        start_time_caption = time.time()
         video = cv2.VideoCapture(video_input)
         length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
         no_of_frame = int(self.no_of_frames)
@@ -158,7 +163,10 @@ class VideoClassifier:
         print("CAPTIONS", captions)
         video.release()
         cv2.destroyAllWindows()
+        end_time_caption = time.time()
+        # print("TIME TAKEN FOR IMAGE CAPTIONING", end_time_caption-start_time_caption)
 
+        start_time_generation = time.time()
         main_categories = Path("main_classes.txt").read_text()
         main_categories_list = ['Automotive', 'Books and Literature', 'Business and Finance', 'Careers', 'Education','Family and Relationships',
             'Fine Art', 'Food & Drink', 'Healthy Living', 'Hobbies & Interests', 'Home & Garden','Medical Health', 'Movies', 'Music and Audio',
@@ -303,9 +311,17 @@ class VideoClassifier:
         second_video = os.path.join(os.path.dirname(__file__), "PersonalFinance_clip.mp4")
 
         # return final_answer, first_video, second_video
+        end_time_generation = time.time()
+        print("MODEL SETUP TIME",end_time_setup-start_time_setup)
+        print("TIME TAKEN FOR AUDIO CONVERSION (WHISPER)",end_time_audio-start_time_audio)
+        print("TIME TAKEN FOR IMAGE CAPTIONING", end_time_caption-start_time_caption)
+        print("TIME TAKEN FOR CLASS GENERATION",end_time_generation - start_time_generation)
         return final_answer
 
+
     def save_model_choice(self,model_name):
+        start_time_setup = time.time()
+
         self.model_name = model_name
         if self.model_name=='mistral':
             print("Setting up Mistral model for Class Selection")
@@ -313,6 +329,9 @@ class VideoClassifier:
         else :
             print("Setting up Gemini model for Class Selection")
             self.setup_gemini_model()
+        end_time_setup = time.time()
+        # print("MODEL SETUP TIME",end_time_setup-start_time_setup)
+
         return "Model selected: " + model_name
 
     def launch_interface(self):
@@ -366,7 +385,8 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     vc = VideoClassifier(no_of_frames=args.no_of_frames, mode=args.mode , model=args.model)
-
+
+
     if args.mode == 'interface':
         vc.launch_interface()
     elif args.mode == 'inference' and args.video_path and args.model:
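Note that, as committed, the summary prints at the end of classify_video reference end_time_setup/start_time_setup and end_time_audio/start_time_audio, which are local variables of save_model_choice and audio_extraction respectively, so reaching those lines raises NameError (a plausible cause of this Space's runtime-error status). Below is a minimal sketch of one way to make the timings visible across methods by storing them on the instance; the timings dict, the _timed helper, and the stubbed stage bodies are illustrative and not part of the original app.py, and the sketch assumes time is imported at the top of the file (the import sits outside the hunks shown).

import time


class VideoClassifierTimings:
    """Sketch only: each stage records its duration on the instance,
    so classify_video can print a summary without touching another
    method's locals."""

    def __init__(self):
        # Hypothetical cross-method store; not in the original app.py.
        self.timings = {}

    def _timed(self, stage, fn):
        # Run one pipeline stage and record its wall-clock duration.
        start = time.time()
        result = fn()
        self.timings[stage] = time.time() - start
        return result

    def audio_extraction(self, video_input):
        # Whisper transcription stubbed; only the timing pattern is real.
        return self._timed("AUDIO CONVERSION (WHISPER)",
                           lambda: f"transcript of {video_input}")

    def classify_video(self, video_input):
        transcript = self.audio_extraction(video_input)
        self._timed("IMAGE CAPTIONING", lambda: None)   # captioning loop stubbed
        self._timed("CLASS GENERATION", lambda: None)   # LLM call stubbed
        for stage, seconds in self.timings.items():
            print(f"TIME TAKEN FOR {stage}: {seconds:.4f}s")
        return transcript


if __name__ == "__main__":
    VideoClassifierTimings().classify_video("PersonalFinance_clip.mp4")

With instance-level timings, moving a print from one method to another can no longer break the script, and the commented-out per-stage prints left behind in audio_extraction and save_model_choice would become unnecessary.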