# TSLAM-1.5B / inference.py
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessor,
    LogitsProcessorList,
)
class EOSLogitsBiasProcessor(LogitsProcessor):
    """Adds a constant bias to the EOS token's logit so generation ends sooner."""

    def __init__(self, eos_token_id, bias_strength):
        self.eos_token_id = eos_token_id
        self.bias_strength = bias_strength

    def __call__(self, input_ids, scores):
        # Shift the EOS logit up by bias_strength for every sequence in the batch
        scores[:, self.eos_token_id] += self.bias_strength
        return scores
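# Quick sanity check of the processor (a minimal sketch: the toy vocabulary
# size of 10 and EOS id of 2 are illustrative, not taken from the model below):
_dummy_scores = torch.zeros(1, 10)
_biased = EOSLogitsBiasProcessor(eos_token_id=2, bias_strength=3.5)(None, _dummy_scores)
assert _biased[0, 2].item() == 3.5  # only the EOS logit moved; all others stay 0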
# Load the model and tokenizer from the current directory (no quantization is applied)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(".", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)
eos_token_id = tokenizer.eos_token_id

bias_strength = 3.5  # Adjust this value based on how eagerly the model should stop
logits_processor = LogitsProcessorList([
    EOSLogitsBiasProcessor(eos_token_id, bias_strength)
])
question = "How is QoS applied on routers?"
formatted_text = f"Question: {question} Answer: "
# Tokenize the formatted text
inputs = tokenizer(formatted_text, return_tensors="pt", return_attention_mask=True).to(device)
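# max_length below counts these prompt tokens too; this check (illustrative
# only) shows how much of the 2000-token budget remains for new tokens:
prompt_tokens = inputs["input_ids"].shape[1]
print(f"Prompt occupies {prompt_tokens} tokens; {2000 - prompt_tokens} remain for generation")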
# Generate the output using the model
outputs = model.generate(
    **inputs,
    max_length=2000,                      # total budget: prompt tokens plus generated tokens
    pad_token_id=tokenizer.eos_token_id,  # reuse EOS for padding (common when no pad token is set)
    repetition_penalty=1.2,
    logits_processor=logits_processor,
)
# Decode the output
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Optionally, print the generated answer
print(text)
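# Optionally isolate just the answer (a minimal sketch assuming the decoded
# text still contains the "Answer:" marker from the prompt template above):
answer = text.split("Answer:", 1)[-1].strip()
print(answer)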