# TSLAM-1.5B / inference.py
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessor,
    LogitsProcessorList,
)
class EOSLogitsBiasProcessor(LogitsProcessor):
    """Adds a constant bias to the EOS token's logit so generation ends sooner."""

    def __init__(self, eos_token_id, bias_strength):
        self.eos_token_id = eos_token_id
        self.bias_strength = bias_strength

    def __call__(self, input_ids, scores):
        # Shift the EOS logit up by bias_strength for every sequence in the batch
        scores[:, self.eos_token_id] += self.bias_strength
        return scores
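# Quick sanity check of the processor (a minimal sketch: the toy vocabulary
# size of 10 and EOS id of 2 are illustrative, not taken from the model below):
_dummy_scores = torch.zeros(1, 10)
_biased = EOSLogitsBiasProcessor(eos_token_id=2, bias_strength=3.5)(None, _dummy_scores)
assert _biased[0, 2].item() == 3.5  # only the EOS logit moved; all others stay 0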
# Load the model and tokenizer from the current directory (no quantization is applied)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(".", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)
eos_token_id = tokenizer.eos_token_id

bias_strength = 3.5  # Adjust this value based on how eagerly the model should stop
logits_processor = LogitsProcessorList([
    EOSLogitsBiasProcessor(eos_token_id, bias_strength)
])
question = "How is QoS applied on routers?"
formatted_text = f"Question: {question} Answer: "
# Tokenize the formatted text
inputs = tokenizer(formatted_text, return_tensors="pt", return_attention_mask=True).to(device)
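# max_length below counts these prompt tokens too; this check (illustrative
# only) shows how much of the 2000-token budget remains for new tokens:
prompt_tokens = inputs["input_ids"].shape[1]
print(f"Prompt occupies {prompt_tokens} tokens; {2000 - prompt_tokens} remain for generation")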
# Generate the output using the model
outputs = model.generate(
    **inputs,
    max_length=2000,                      # total budget: prompt tokens plus generated tokens
    pad_token_id=tokenizer.eos_token_id,  # reuse EOS for padding (common when no pad token is set)
    repetition_penalty=1.2,
    logits_processor=logits_processor,
)
# Decode the output
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Optionally, print the generated answer
print(text)
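# Optionally isolate just the answer (a minimal sketch assuming the decoded
# text still contains the "Answer:" marker from the prompt template above):
answer = text.split("Answer:", 1)[-1].strip()
print(answer)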