Pradheep1647 committed on
Commit 9f703fc
1 Parent(s): c3c9064

ask for the youtube api from the user

Files changed (1)
  1. app.py +30 -43
app.py CHANGED
@@ -10,24 +10,11 @@ from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
  from transformers import BlipProcessor, BlipForConditionalGeneration
  import cv2
- import browser_cookie3
-
- def get_youtube_cookies(browser):
-     if browser == 'Chrome':
-         return browser_cookie3.chrome()
-     elif browser == 'Firefox':
-         return browser_cookie3.firefox()
-     elif browser == 'Edge':
-         return browser_cookie3.edge()
-     elif browser == 'Brave':
-         return browser_cookie3.brave()
-     else:
-         raise ValueError("Unsupported browser")
-
- def download_youtube_video(video_url, browser):
-     cookies = get_youtube_cookies(browser)
+
+ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')
+
+ def download_youtube_video(video_url, api_key):
      ydl_opts = {
-         'cookiefile': cookies,
          'format': 'bestvideo+bestaudio',
          'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
      }
@@ -60,7 +47,7 @@ def convert_mp3_to_wav(mp3_path):
  def process_text(text):
      model_name = "cardiffnlp/twitter-roberta-base-emotion"
      emotion_labels = ['anger', 'joy', 'optimism', 'sad']
-
+
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForSequenceClassification.from_pretrained(model_name)

@@ -68,12 +55,12 @@ def process_text(text):
      with torch.no_grad():
          outputs = model(**inputs)
      logits = outputs.logits
-
+
      emotion_probs = torch.softmax(logits, dim=-1).squeeze()
      predicted_emotion = emotion_labels[torch.argmax(emotion_probs)]
-
+
      emotion_dict = {emotion_labels[i]: emotion_probs[i].item() for i in range(len(emotion_labels))}
-
+
      return emotion_dict, predicted_emotion

  def preprocess_frame(frame):
@@ -89,11 +76,11 @@ def generate_caption(pixel_values):
  def predict_emotions(caption):
      inputs = emotion_tokenizer(caption, return_tensors='pt', truncation=True, padding=True)
      outputs = emotion_model(**inputs)
-
+
      emotion_probs = torch.softmax(outputs.logits, dim=1)
-
+
      predicted_emotions = {label: prob.item() for label, prob in zip(emotion_labels, emotion_probs[0])}
-
+
      return predicted_emotions

  caption_model_name = "Salesforce/blip-image-captioning-base"
@@ -104,22 +91,17 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
  emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
  emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

- def analyze_video(video_url, browser):
+ def analyze_video(video_url):
      global output_path
      output_path = './'
-
-     video_path = download_youtube_video(video_url, browser)

+     video_path = download_youtube_video(video_url, YOUTUBE_API_KEY)
      mp4_path = convert_to_mp4(video_path)
-
      audio_path = extract_audio_from_video(mp4_path)
-
      audio_wav_path = convert_mp3_to_wav(audio_path)

      model_whisper = whisper.load_model("base")
-
      result_whisper = model_whisper.transcribe(audio_wav_path)
-
      transcript = result_whisper['text']

      emotion_dict_text, predicted_emotion_text = process_text(transcript)
@@ -128,9 +110,7 @@ def analyze_video(video_url, browser):
      emotion_vectors_video = []

      video_capture = cv2.VideoCapture(mp4_path)
-
      total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
-
      frame_count_video = 0

      while video_capture.isOpened():
@@ -150,21 +130,28 @@ def analyze_video(video_url, browser):
      video_capture.release()

      average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
-
      combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
-
      final_most_predicted_index = np.argmax(combined_emotion_vector_final)
-
      final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]

      return transcript, predicted_emotion_text, final_most_predicted_emotion

- iface = gr.Interface(fn=analyze_video,
-                      inputs=[gr.Textbox(label="YouTube Video URL"),
-                              gr.Dropdown(label="Select Browser", choices=["Chrome", "Firefox", "Edge", "Brave"])],
-                      outputs=["text", "text", "text"],
-                      title="Multimodal Emotion Recognition",
-                      description="Enter a YouTube Video URL and select your browser to analyze emotions from both audio and visual content.")
+ with gr.Blocks() as iface:
+     gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload or enter a YouTube Video URL and analyze emotions from both audio and video frames.")
+
+     with gr.Row():
+         video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter video URL here...", interactive=True)
+         api_key = gr.Textbox(label="YouTube API Key", placeholder="Enter your API key", type="password", interactive=True)
+
+     with gr.Row():
+         submit_button = gr.Button("Analyze Video")
+
+     with gr.Row():
+         transcript_output = gr.Textbox(label="Transcript", interactive=False)
+         audio_emotion_output = gr.Textbox(label="Emotion from Audio", interactive=False)
+         visual_emotion_output = gr.Textbox(label="Emotion from Video", interactive=False)
+
+     submit_button.click(analyze_video, inputs=[video_url, api_key], outputs=[transcript_output, audio_emotion_output, visual_emotion_output])

  if __name__ == "__main__":
-     iface.launch()
+     iface.launch()
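
Note on this change: only the ydl_opts dictionary of the new download_youtube_video(video_url, api_key) falls inside the diff context, and, as committed, analyze_video is defined with a single video_url parameter while the Gradio click handler wires two inputs (video_url and api_key) and the downloader is called with the YOUTUBE_API_KEY environment variable rather than the textbox value. A minimal sketch of how the helper and signatures could line up, assuming yt-dlp performs the actual download; the YoutubeDL/extract_info/prepare_filename calls and the api_key fallback below are illustrative assumptions, not code from commit 9f703fc:

# Illustrative sketch only -- not code from commit 9f703fc.
import os
import yt_dlp

YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')

def download_youtube_video(video_url, api_key):
    # yt-dlp does not need a YouTube Data API key to fetch a public video;
    # api_key is accepted only to mirror the signature introduced in the diff.
    ydl_opts = {
        'format': 'bestvideo+bestaudio',
        'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        return ydl.prepare_filename(info)

def analyze_video(video_url, api_key):
    # Accepting api_key keeps the signature consistent with
    # submit_button.click(..., inputs=[video_url, api_key], ...);
    # fall back to the environment variable when the textbox is left empty.
    video_path = download_youtube_video(video_url, api_key or YOUTUBE_API_KEY)
    # The rest of the pipeline (conversion, Whisper transcription, frame
    # captioning, emotion fusion) would continue as in the diff above.
    return video_path

With this wiring, a user-supplied key would take precedence and the environment variable would remain a fallback for deployments where the secret is configured.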