sanjanatule committed
Commit d24b09d
1 Parent(s): 8819719

Update app.py

Files changed (1)
  1. app.py +15 -2
app.py CHANGED
@@ -5,6 +5,7 @@ from transformers import AutoTokenizer,BitsAndBytesConfig, AutoModelForCausalLM,
 import torch
 from peft import PeftModel
 import torch.nn as nn
+import whisperx
 
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name = "microsoft/phi-2"
@@ -15,6 +16,8 @@ IMAGE_TOKEN_ID = 23893 # token for word comment
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_embed = 768
 phi_embed = 2560
+compute_type = "float16"
+audio_batch_size = 16
 
 class SimpleResBlock(nn.Module):
     def __init__(self, phi_embed):
@@ -33,8 +36,8 @@ class SimpleResBlock(nn.Module):
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 resblock = SimpleResBlock(phi_embed).to(device)
-
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
+audio_model = whisperx.load_model("large-v2", device, compute_type=compute_type)
 
 # load weights
 model_to_merge = PeftModel.from_pretrained(phi_model,'./model_chkpt/lora_adaptor')
@@ -53,9 +56,19 @@ def model_generate_ans(img,val_q):
     clip_val_outputs = clip_model(**image_processed).last_hidden_state[:,1:,:]
     val_image_embeds = projection(clip_val_outputs)
     val_image_embeds = resblock(val_image_embeds).to(torch.float16)
-
     img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
     img_token_embeds = merged_model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)
+
+    # audio
+    # audio = whisperx.load_audio(audio_file)
+    # result = audio_model.transcribe(audio, batch_size=audio_batch_size)
+
+    # audio_txt = []
+    # for s in result["segments"]:
+    #     audio_txt.append(s['text'])
+    #     print(s['text'])
+
+    # audio_text = "".join(audio_txt)
 
     val_q_tokenised = tokenizer(val_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
     val_q_embeds = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
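
For reference, the commented-out lines in model_generate_ans follow the standard whisperx transcription flow. The sketch below shows that flow end to end; whisperx.load_audio, transcribe, and the "segments"/"text" result structure are the documented whisperx API, while the audio_file value and the int8 fallback on CPU are assumptions not present in this commit.

# Minimal sketch of the whisperx flow used by the commented-out lines.
# audio_file is a hypothetical path, not part of the commit; the int8
# fallback on CPU is an assumption (float16 inference needs a GPU backend).
import torch
import whisperx

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"
audio_batch_size = 16

audio_model = whisperx.load_model("large-v2", device, compute_type=compute_type)

audio_file = "sample.wav"                  # hypothetical input
audio = whisperx.load_audio(audio_file)    # 16 kHz mono float32 waveform
result = audio_model.transcribe(audio, batch_size=audio_batch_size)

# Each segment dict carries a 'text' field; join them into one transcript.
audio_text = "".join(s["text"] for s in result["segments"])
print(audio_text)

In the commit itself this audio path stays commented out, so the transcript is not yet combined with the image and question embeddings.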