Coool2 committed
Commit 0007857 · 1 Parent(s): 5fa5125

Update agent.py

Files changed (1):
  1. agent.py +31 -19
agent.py CHANGED
@@ -120,7 +120,8 @@ def initialize_models(use_api_mode=False):
         print("Initializing models in non-API mode with local models...")
 
         try:
-            from typing import Any, Optional, List, Mapping
+            from typing import Optional, List, Any
+            from pydantic import Field, PrivateAttr
             from llama_index.core.llms import CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata
             from llama_index.core.llms.callbacks import llm_completion_callback
             from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
@@ -128,16 +129,18 @@ def initialize_models(use_api_mode=False):
             import torch
 
             class QwenVL7BCustomLLM(CustomLLM):
-                context_window: int = 32768
-                num_output: int = 256
-                model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"
-                _device: str = PrivateAttr()
-                def __init__(self, device: str = "cuda", **kwargs):
-                    self._device = "cuda"
-                    self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_name: str = Field(default="Qwen/Qwen2.5-VL-7B-Instruct")
+                context_window: int = Field(default=32768)
+                num_output: int = Field(default=256)
+                _model = PrivateAttr()
+                _processor = PrivateAttr()
+
+                def __init__(self, **kwargs):
+                    super().__init__(**kwargs)
+                    self._model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                         self.model_name, torch_dtype=torch.bfloat16, device_map="auto"
                     )
-                    self.processor = AutoProcessor.from_pretrained(self.model_name)
+                    self._processor = AutoProcessor.from_pretrained(self.model_name)
 
                 @property
                 def metadata(self) -> LLMMetadata:
@@ -148,41 +151,50 @@ def initialize_models(use_api_mode=False):
                     )
 
                 @llm_completion_callback()
-                def complete(self, prompt: str, image_paths: Optional[List[str]] = None, **kwargs: Any) -> CompletionResponse:
-                    # Prepare messages for multimodal input
+                def complete(
+                    self,
+                    prompt: str,
+                    image_paths: Optional[List[str]] = None,
+                    **kwargs: Any
+                ) -> CompletionResponse:
+                    # Prepare multimodal input
                     messages = [{"role": "user", "content": []}]
                     if image_paths:
                         for path in image_paths:
                             messages[0]["content"].append({"type": "image", "image": path})
                     messages[0]["content"].append({"type": "text", "text": prompt})
 
-                    # Process inputs
-                    text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                    # Tokenize and process
+                    text = self._processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                     image_inputs, video_inputs = process_vision_info(messages)
-                    inputs = self.processor(
+                    inputs = self._processor(
                         text=[text],
                         images=image_inputs,
                         videos=video_inputs,
                         padding=True,
                         return_tensors="pt",
                     )
-                    inputs = inputs.to(self.model.device)
+                    inputs = inputs.to(self._model.device)
 
                     # Generate output
-                    generated_ids = self.model.generate(**inputs, max_new_tokens=self.num_output)
+                    generated_ids = self._model.generate(**inputs, max_new_tokens=self.num_output)
                     generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-                    output_text = self.processor.batch_decode(
+                    output_text = self._processor.batch_decode(
                         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                     )[0]
                     return CompletionResponse(text=output_text)
 
                 @llm_completion_callback()
-                def stream_complete(self, prompt: str, image_paths: Optional[List[str]] = None, **kwargs: Any) -> CompletionResponseGen:
+                def stream_complete(
+                    self,
+                    prompt: str,
+                    image_paths: Optional[List[str]] = None,
+                    **kwargs: Any
+                ) -> CompletionResponseGen:
                     response = self.complete(prompt, image_paths)
                     for token in response.text:
                         yield CompletionResponse(text=token, delta=token)
 
-
             proj_llm = QwenVL7BCustomLLM()
 
             # Code LLM
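
The substance of the change: LlamaIndex's CustomLLM is a Pydantic model, so the old __init__ failed in two ways. It assigned self.model and self.processor without declaring them as fields, which Pydantic rejects, and it never called super().__init__(), so the declared fields were never initialized. The new version declares the public fields with Field(default=...), keeps the HuggingFace model and processor in PrivateAttr slots that are exempt from validation, and calls super().__init__(**kwargs) before touching any attribute. Note that complete() relies on process_vision_info from qwen_vl_utils, which is presumably imported elsewhere in agent.py. Below is a minimal usage sketch of the fixed class, not part of the commit, assuming a GPU large enough for the Qwen2.5-VL-7B weights and a hypothetical sample image at ./example.jpg:

    # Sketch only: in agent.py the class is local to initialize_models(),
    # so this assumes the same scope (or that the class is lifted to module level).
    llm = QwenVL7BCustomLLM()  # field defaults now pass Pydantic validation

    # Text-only completion
    print(llm.complete("Summarize Qwen2.5-VL in one sentence.").text)

    # Multimodal completion: each path becomes an {"type": "image"} content part
    response = llm.complete(
        "What is shown in this picture?",
        image_paths=["./example.jpg"],  # hypothetical sample image
    )
    print(response.text)

    # stream_complete() runs complete() and re-yields the finished text
    for chunk in llm.stream_complete("Say hello."):
        print(chunk.delta, end="", flush=True)

One caveat: because stream_complete() iterates over the completed string, it streams characters rather than model tokens; true incremental streaming would need something like transformers' TextIteratorStreamer.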