Multi-GPU inference using accelerate
#23
by
dataviral
- opened
- modeling_mpt.py +1 -1
modeling_mpt.py
CHANGED
@@ -235,7 +235,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
|
|
235 |
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
236 |
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
237 |
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
|
238 |
-
logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
|
239 |
if self.logit_scale is not None:
|
240 |
if self.logit_scale == 0:
|
241 |
warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
|
|
|
235 |
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
236 |
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
237 |
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
|
238 |
+
logits = F.linear(outputs.last_hidden_state.to(self.transformer.wte.weight.device), self.transformer.wte.weight)
|
239 |
if self.logit_scale is not None:
|
240 |
if self.logit_scale == 0:
|
241 |
warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
|