neuralworm committed
Commit 150947d · 1 Parent(s): 0d9095f

fix for gemma

bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
 
repo.txt CHANGED
@@ -181,7 +181,7 @@ import torch, random, numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 from typing import List, Optional
 
-DEBUG = 1
+DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
 
 def dbg(*args):
     if DEBUG:
@@ -192,54 +192,63 @@ class LLM:
         self.model_id = model_id
         self.seed = seed
 
+        # Set all seeds for reproducibility
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        try:
+            torch.use_deterministic_algorithms(True, warn_only=True)
+        except Exception as e:
+            dbg(f"Could not set deterministic algorithms: {e}")
         set_seed(seed)
+
         token = os.environ.get("HF_TOKEN")
+        if not token and ("gemma-3" in model_id or "llama" in model_id):
+            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
         kwargs = {}
-        if torch.cuda.is_available():
-            kwargs["torch_dtype"] = torch.bfloat16
+        if dtype == "float16": kwargs["torch_dtype"] = torch.float16
+        elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
 
         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()
+        self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
 
-        dbg(f"Loaded model: {model_id}")
+        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
 
-    def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
+    def generate_json(self, system_prompt: str, user_prompt: str,
+                      max_new_tokens: int = 256, temperature: float = 0.7,
+                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
         set_seed(self.seed)
 
-        messages = [
-            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
-        ]
-
-        # Using a simpler user-only template that is robust for Gemma
-        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        if self.is_instruction_tuned:
+            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         input_token_length = inputs.input_ids.shape[1]
 
         with torch.no_grad():
-            terminators = [
-                self.tokenizer.eos_token_id,
-                self.tokenizer.convert_tokens_to_ids("<|eot_id|>") if "<|eot_id|>" in self.tokenizer.additional_special_tokens else self.tokenizer.eos_token_id
-            ]
-
             out = self.model.generate(
                 **inputs,
-                do_sample=(temperature > 0 and temperature < 1.0),
-                temperature=max(temperature, 0.01),
-                max_new_tokens=200,
-                eos_token_id=terminators,
+                do_sample=(temperature > 0),
+                temperature=temperature,
+                top_p=top_p,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=num_return_sequences,
                 pad_token_id=self.tokenizer.eos_token_id
             )
 
-        completion = self.tokenizer.decode(out[0, input_token_length:], skip_special_tokens=True)
+        new_tokens = out[:, input_token_length:]
+        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
 
-        dbg("Cleaned Agent Completion:", completion)
-        return completion
+        dbg("Cleaned model completions:", completions)
+        return completions
 
 [File Ends] bp_phi/llm_iface.py
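For orientation, here is a minimal usage sketch of the interface after this commit. Only generate_json's parameters and the gemma-3 model-family check are taken from the diff above; the constructor keyword names (device, dtype, seed), the example model id, and the BP_PHI_DEBUG value are assumptions based on how those names are used in the hunk, not a confirmed signature.

# Hedged usage sketch: generate_json's arguments match the diff above;
# the LLM(...) keyword names and the model id are assumptions, since
# __init__'s signature falls outside this hunk.
import os

os.environ["BP_PHI_DEBUG"] = "1"  # must be set before import: DEBUG is read at module load
# export HF_TOKEN=... beforehand for gated gemma-3 / llama checkpoints

from bp_phi.llm_iface import LLM

llm = LLM(model_id="google/gemma-3-1b-it", device="auto", dtype="bfloat16", seed=42)

completions = llm.generate_json(
    system_prompt="Answer strictly in JSON.",
    user_prompt='Return {"ok": true} and nothing else.',
    max_new_tokens=64,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
)
print(completions[0])  # a list is returned, one string per requested sequence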