import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessor,
    LogitsProcessorList,
)


class EOSLogitsBiasProcessor(LogitsProcessor):
    """Bias the end-of-sequence logit at every decoding step so the model is
    nudged toward finishing its answer sooner."""

    def __init__(self, eos_token_id, bias_strength):
        super().__init__()
        self.eos_token_id = eos_token_id
        self.bias_strength = bias_strength

    def __call__(self, input_ids, scores):
        # Raise the EOS logit for every sequence in the batch: a positive
        # bias makes EOS more likely, a negative one makes it less likely.
        scores[:, self.eos_token_id] += self.bias_strength
        return scores
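

# Minimal sanity check (illustrative values only, not part of the pipeline):
# with a dummy 10-token vocabulary and EOS id 0, the processor should raise
# the EOS logit by exactly bias_strength and leave all other logits untouched.
_scores = EOSLogitsBiasProcessor(eos_token_id=0, bias_strength=3.5)(
    torch.empty(1, 0, dtype=torch.long), torch.zeros(1, 10)
)
assert _scores[0, 0].item() == 3.5 and _scores[0, 1].item() == 0.0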


device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer from the current directory.
model = AutoModelForCausalLM.from_pretrained(".", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)

eos_token_id = tokenizer.eos_token_id

# Strength of the push toward EOS; larger values end generation sooner.
bias_strength = 3.5
logits_processor = LogitsProcessorList([
    EOSLogitsBiasProcessor(eos_token_id, bias_strength)
])
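
# LogitsProcessorList applies its members in order at each decoding step, so
# other processors can be chained with the EOS bias. For example, transformers'
# MinLengthLogitsProcessor could enforce a minimum answer length first:
#     from transformers import MinLengthLogitsProcessor
#     logits_processor = LogitsProcessorList([
#         MinLengthLogitsProcessor(10, eos_token_id),
#         EOSLogitsBiasProcessor(eos_token_id, bias_strength),
#     ])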

question = "How is QoS applied on routers?"
formatted_text = f"Question: {question} Answer: "

# Tokenize the prompt and move it to the model's device.
inputs = tokenizer(formatted_text, return_tensors="pt", return_attention_mask=True).to(device)

outputs = model.generate(
    **inputs,
    max_length=2000,                      # cap on total length, prompt tokens included
    pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
    repetition_penalty=1.2,
    logits_processor=logits_processor,
)

# Decode the first (and only) sequence, dropping special tokens such as EOS.
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(text)
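
# Note: bias_strength is a free knob, not a value prescribed by transformers;
# a negative value has the opposite effect and discourages early termination,
# e.g. EOSLogitsBiasProcessor(eos_token_id, -3.5) for longer answers.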