Commit ea37b8e • 1 Parent(s): 2b5d0a7
Update app.py

app.py CHANGED
@@ -48,6 +48,8 @@ resblock.load_state_dict(torch.load('./model_chkpt/step2_resblock.pth',map_locat
 def model_generate_ans(img=None,img_audio=None,val_q=None):

     max_generate_length = 100
+    val_combined_embeds = []
+
     with torch.no_grad():

         # image
@@ -60,33 +62,38 @@ def model_generate_ans(img=None,img_audio=None,val_q=None):
         img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
         img_token_embeds = merged_model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)

+        val_combined_embeds.append(val_image_embeds)
+        val_combined_embeds.append(img_token_embeds)
+
         # audio
         if img_audio is not None:
-            audio_result = audio_model.transcribe(
+            audio_result = audio_model.transcribe(img_audio)
             audio_text = ''
             for seg in audio_result['segments']:
                 audio_text += seg['text']
             audio_text = audio_text.strip()
             audio_tokens = tokenizer(audio_text, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
             audio_embeds = merged_model.model.embed_tokens(audio_tokens).unsqueeze(0)
+            val_combined_embeds.append(audio_embeds)

         # text question
-        if val_q
+        if len(val_q) != 0:
             val_q_tokenised = tokenizer(val_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
             val_q_embeds = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
-
-        val_combined_embeds = []
-        if img is not None:
-            #val_combined_embeds = torch.cat([val_combined_embeds, val_image_embeds, img_token_embeds], dim=1)
-            val_combined_embeds.append(val_image_embeds)
-            val_combined_embeds.append(img_token_embeds)
-        if img_audio is not None:
-            #val_combined_embeds = torch.cat([val_combined_embeds, audio_embeds], dim=1)
-            val_combined_embeds.append(audio_embeds)
-        if val_q is not None:
-            #val_combined_embeds = torch.cat([val_combined_embeds, val_q_embeds], dim=1)
             val_combined_embeds.append(val_q_embeds)

+        # val_combined_embeds = []
+        # if img is not None:
+        #     #val_combined_embeds = torch.cat([val_combined_embeds, val_image_embeds, img_token_embeds], dim=1)
+        #     val_combined_embeds.append(val_image_embeds)
+        #     val_combined_embeds.append(img_token_embeds)
+        # if img_audio is not None:
+        #     #val_combined_embeds = torch.cat([val_combined_embeds, audio_embeds], dim=1)
+        #     val_combined_embeds.append(audio_embeds)
+        # if len(val_q) != 0:
+        #     #val_combined_embeds = torch.cat([val_combined_embeds, val_q_embeds], dim=1)
+        #     val_combined_embeds.append(val_q_embeds)
+
         val_combined_embeds = torch.cat(val_combined_embeds,dim=1)

         #val_combined_embeds = torch.cat([val_image_embeds, img_token_embeds, val_q_embeds], dim=1) # 4, 69, 2560
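
For context, a minimal, self-contained sketch of the embedding-assembly pattern this commit moves to: val_combined_embeds now starts as an empty list at the top of the function, each available modality (image features, the IMAGE_TOKEN_ID marker, the transcribed audio, the text question) appends its embeddings, and a single torch.cat(..., dim=1) joins them along the sequence dimension. The random tensors below are hypothetical stand-ins for the projection and embed_tokens outputs in app.py, so the snippet runs without the model checkpoints; the hidden size and total sequence length follow the "# 4, 69, 2560" comment in the diff (with batch size 1 here).

import torch

embed_dim = 2560  # hidden size implied by the "# 4, 69, 2560" comment

# Hypothetical stand-ins for the embeddings app.py computes from the model:
val_image_embeds = torch.randn(1, 49, embed_dim)  # projected image features
img_token_embeds = torch.randn(1, 1, embed_dim)   # the IMAGE_TOKEN_ID marker
audio_embeds     = torch.randn(1, 12, embed_dim)  # embedded audio transcript
val_q_embeds     = torch.randn(1, 7, embed_dim)   # embedded text question

img_audio = "clip.wav"          # pretend an audio file was supplied
val_q = "what is happening?"    # pretend a question was supplied

# Post-commit pattern: collect whatever modalities are present, concat once.
val_combined_embeds = []
val_combined_embeds.append(val_image_embeds)  # image appended unconditionally
val_combined_embeds.append(img_token_embeds)
if img_audio is not None:
    val_combined_embeds.append(audio_embeds)
if len(val_q) != 0:
    val_combined_embeds.append(val_q_embeds)

val_combined_embeds = torch.cat(val_combined_embeds, dim=1)
print(val_combined_embeds.shape)  # torch.Size([1, 69, 2560])

The other functional fixes in the hunk are passing the uploaded file through to audio_model.transcribe(img_audio), a Whisper-style API whose result exposes 'segments' entries whose 'text' fields are concatenated and stripped into one transcript, and guarding the question branch with len(val_q) != 0 so an empty string contributes nothing to the concatenation.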