Spaces:
Sleeping
Sleeping
Commit
•
d24b09d
1
Parent(s):
8819719
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ from transformers import AutoTokenizer,BitsAndBytesConfig, AutoModelForCausalLM,
|
|
5 |
import torch
|
6 |
from peft import PeftModel
|
7 |
import torch.nn as nn
|
|
|
8 |
|
9 |
clip_model_name = "openai/clip-vit-base-patch32"
|
10 |
phi_model_name = "microsoft/phi-2"
|
@@ -15,6 +16,8 @@ IMAGE_TOKEN_ID = 23893 # token ID for the word "comment"
|
|
15 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
16 |
clip_embed = 768
|
17 |
phi_embed = 2560
|
|
|
|
|
18 |
|
19 |
class SimpleResBlock(nn.Module):
|
20 |
def __init__(self, phi_embed):
|
@@ -33,8 +36,8 @@ class SimpleResBlock(nn.Module):
|
|
33 |
clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
|
34 |
projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
|
35 |
resblock = SimpleResBlock(phi_embed).to(device)
|
36 |
-
|
37 |
phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
|
|
|
38 |
|
39 |
# load weights
|
40 |
model_to_merge = PeftModel.from_pretrained(phi_model,'./model_chkpt/lora_adaptor')
|
@@ -53,9 +56,19 @@ def model_generate_ans(img,val_q):
|
|
53 |
clip_val_outputs = clip_model(**image_processed).last_hidden_state[:,1:,:]
|
54 |
val_image_embeds = projection(clip_val_outputs)
|
55 |
val_image_embeds = resblock(val_image_embeds).to(torch.float16)
|
56 |
-
|
57 |
img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
|
58 |
img_token_embeds = merged_model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
val_q_tokenised = tokenizer(val_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
|
61 |
val_q_embeds = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
|
|
|
5 |
import torch
|
6 |
from peft import PeftModel
|
7 |
import torch.nn as nn
|
8 |
+
import whisperx
|
9 |
|
10 |
clip_model_name = "openai/clip-vit-base-patch32"
|
11 |
phi_model_name = "microsoft/phi-2"
|
|
|
16 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
17 |
clip_embed = 768
|
18 |
phi_embed = 2560
|
19 |
+
compute_type = "float16"
|
20 |
+
audio_batch_size = 16
|
21 |
|
22 |
class SimpleResBlock(nn.Module):
|
23 |
def __init__(self, phi_embed):
|
|
|
36 |
clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
|
37 |
projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
|
38 |
resblock = SimpleResBlock(phi_embed).to(device)
|
|
|
39 |
phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
|
40 |
+
audio_model = whisperx.load_model("large-v2", device, compute_type=compute_type)
|
41 |
|
42 |
# load weights
|
43 |
model_to_merge = PeftModel.from_pretrained(phi_model,'./model_chkpt/lora_adaptor')
|
|
|
56 |
clip_val_outputs = clip_model(**image_processed).last_hidden_state[:,1:,:]
|
57 |
val_image_embeds = projection(clip_val_outputs)
|
58 |
val_image_embeds = resblock(val_image_embeds).to(torch.float16)
|
|
|
59 |
img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
|
60 |
img_token_embeds = merged_model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)
|
61 |
+
|
62 |
+
# audio: transcribe an input audio file with WhisperX (currently disabled)
|
63 |
+
# audio = whisperx.load_audio(audio_file)
|
64 |
+
# result = audio_model.transcribe(audio, batch_size=audio_batch_size)
|
65 |
+
|
66 |
+
# audio_txt = []
|
67 |
+
# for s in result["segments"]:
|
68 |
+
# audio_txt.append(s['text'])
|
69 |
+
# print(s['text'])
|
70 |
+
|
71 |
+
# audio_text = "".join(audio_txt)
|
72 |
|
73 |
val_q_tokenised = tokenizer(val_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
|
74 |
val_q_embeds = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
|