qtnx committed
Commit d838dbe
1 Parent(s): f901c2a

Update __main__.py

Files changed (1)
  1. __main__.py +71 -51
__main__.py CHANGED
@@ -1,31 +1,32 @@
 import argparse
-import sys
-
-import numpy as np
 import torch
 import torch.nn as nn
 from PIL import Image
 from transformers import (
-    AutoModel,
-    AutoProcessor,
     AutoTokenizer,
     BitsAndBytesConfig,
-    LlamaForCausalLM, SiglipImageProcessor, SiglipVisionModel
-
+    LlamaForCausalLM,
+    SiglipImageProcessor,
+    SiglipVisionModel,
 )
 from transformers import TextStreamer
 
 
-
-def tokenizer_image_token(prompt, tokenizer, image_token_index=-200, return_tensors=None):
-    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+def tokenizer_image_token(
+    prompt, tokenizer, image_token_index=-200, return_tensors=None
+):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
 
     def insert_separator(X, sep):
         return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
 
     input_ids = []
     offset = 0
-    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+    if (
+        len(prompt_chunks) > 0
+        and len(prompt_chunks[0]) > 0
+        and prompt_chunks[0][0] == tokenizer.bos_token_id
+    ):
         offset = 1
         input_ids.append(prompt_chunks[0][0])
 
@@ -41,7 +42,7 @@ def process_tensors(input_ids, image_features, embedding_layer):
 
     # Split the input_ids at the index found, excluding -200
     input_ids_1 = input_ids[:, :split_index]
-    input_ids_2 = input_ids[:, split_index + 1:]
+    input_ids_2 = input_ids[:, split_index + 1 :]
 
     # Convert input_ids to embeddings
     embeddings_1 = embedding_layer(input_ids_1)
@@ -57,7 +58,9 @@ def process_tensors(input_ids, image_features, embedding_layer):
     )
 
     # Create the corrected attention mask
-    attention_mask = torch.ones(concatenated_embeddings.shape[:2], dtype=torch.long, device=device)
+    attention_mask = torch.ones(
+        concatenated_embeddings.shape[:2], dtype=torch.long, device=device
+    )
     return concatenated_embeddings, attention_mask
 
 
@@ -66,7 +69,9 @@ def initialize_models():
         load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
     )
 
-    tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct", use_fast=True)
+    tokenizer = AutoTokenizer.from_pretrained(
+        "unsloth/llama-3-8b-Instruct", use_fast=True
+    )
     model = LlamaForCausalLM.from_pretrained(
         "unsloth/llama-3-8b-Instruct",
         torch_dtype=torch.float16,
@@ -78,7 +83,9 @@ def initialize_models():
         param.requires_grad = False
 
     model_name = "google/siglip-so400m-patch14-384"
-    vision_model = SiglipVisionModel.from_pretrained(model_name, torch_dtype=torch.float16)
+    vision_model = SiglipVisionModel.from_pretrained(
+        model_name, torch_dtype=torch.float16
+    )
     processor = SiglipImageProcessor.from_pretrained(model_name)
 
     vision_model = vision_model.to("cuda")
@@ -94,13 +101,14 @@ class ProjectionModule(nn.Module):
         self.model = nn.Sequential(
             nn.Linear(mm_hidden_size, hidden_size),
             nn.GELU(),
-            nn.Linear(hidden_size, hidden_size)
+            nn.Linear(hidden_size, hidden_size),
         )
 
     def forward(self, x):
         return self.model(x)
 
-def load_projection_module(mm_hidden_size=1152, hidden_size=4096, device='cuda'):
+
+def load_projection_module(mm_hidden_size=1152, hidden_size=4096, device="cuda"):
     projection_module = ProjectionModule(mm_hidden_size, hidden_size)
     checkpoint = torch.load("./mm_projector.bin")
     checkpoint = {k.replace("mm_projector.", ""): v for k, v in checkpoint.items()}
@@ -110,37 +118,46 @@ def load_projection_module(mm_hidden_size=1152, hidden_size=4096, device='cuda')
 
 
 def answer_question(
-        image_path, tokenizer, model, vision_model, processor, projection_module
+    image_path, tokenizer, model, vision_model, processor, projection_module
 ):
-    image = Image.open(image_path).convert('RGB')
+    image = Image.open(image_path).convert("RGB")
 
    tokenizer.bos_token_id = None
    tokenizer.eos_token = "<|eot_id|>"
 
    try:
-        inp = input('user: ')
+        inp = input("user: ")
    except EOFError:
        inp = ""
    if not inp:
-        sys.exit("exiting..")
+        print("exit...")
 
-    question = '<image>' + inp
+    question = "<image>" + inp
 
    prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
 
-    input_ids = tokenizer_image_token(prompt, tokenizer, -200, return_tensors='pt').unsqueeze(0).to(
-        model.device)
+    input_ids = (
+        tokenizer_image_token(prompt, tokenizer, -200, return_tensors="pt")
+        .unsqueeze(0)
+        .to(model.device)
+    )
 
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
    with torch.inference_mode():
-        image_inputs = processor(images=[image], return_tensors="pt", do_resize=True,
-                                 size={"height": 384, "width": 384}).to("cuda")
-
-        image_inputs = image_inputs['pixel_values'].squeeze(0)
-
-        image_forward_outs = vision_model(image_inputs.to(device='cuda', dtype=torch.float16).unsqueeze(0),
-                                          output_hidden_states=True)
+        image_inputs = processor(
+            images=[image],
+            return_tensors="pt",
+            do_resize=True,
+            size={"height": 384, "width": 384},
+        ).to("cuda")
+
+        image_inputs = image_inputs["pixel_values"].squeeze(0)
+
+        image_forward_outs = vision_model(
+            image_inputs.to(device="cuda", dtype=torch.float16).unsqueeze(0),
+            output_hidden_states=True,
+        )
 
        image_features = image_forward_outs.hidden_states[-2]
 
@@ -149,42 +166,45 @@ def answer_question(
        projected_embeddings = projection_module(image_features2).to("cuda")
 
        embedding_layer = model.get_input_embeddings()
-        #text_embeddings = embedding_layer(input_ids)
+        # text_embeddings = embedding_layer(input_ids)
 
-        new_embeds, attn_mask = process_tensors(input_ids, projected_embeddings, embedding_layer)
+        new_embeds, attn_mask = process_tensors(
+            input_ids, projected_embeddings, embedding_layer
+        )
        device = model.device
        attn_mask = attn_mask.to(device)
        new_embeds = new_embeds.to(device)
 
        model_kwargs = {
-            'do_sample': True,
-            'temperature': 0.2,
-            'max_new_tokens': 2000,
-            'use_cache': True,
-            'streamer': streamer
+            "do_sample": True,
+            "temperature": 0.2,
+            "max_new_tokens": 2000,
+            "use_cache": True,
+            "streamer": streamer,
        }
 
        while True:
-            print('assistant: ')
-
            generated_ids = model.generate(
-                inputs_embeds=new_embeds,
-                attention_mask=attn_mask,
-                **model_kwargs
-
+                inputs_embeds=new_embeds, attention_mask=attn_mask, **model_kwargs
            )[0]
 
            generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)
            try:
-                inp = input('user: ')
+                inp = input("user: ")
            except EOFError:
                inp = ""
            if not inp:
-                print("exiting...")
-                break
-
-            new_text = generated_text + "<|start_header_id|>user<|end_header_id|>\n\n" + inp + "<|start_header_id|>assistant<|end_header_id|>\n\n"
-            new_input_ids = tokenizer(new_text, return_tensors='pt').input_ids.to(device)
+                print("exit...")
+
+            new_text = (
+                generated_text
+                + "<|start_header_id|>user<|end_header_id|>\n\n"
+                + inp
+                + "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+            new_input_ids = tokenizer(new_text, return_tensors="pt").input_ids.to(
+                device
+            )
            new_embeddings = embedding_layer(new_input_ids)
 
            new_embeds = torch.cat([new_embeds, new_embeddings], dim=1)
@@ -206,4 +226,4 @@ if __name__ == "__main__":
        vision_model,
        processor,
        projection_module,
-    )
+    )