blockblockblock
/

Quiet-Star-Custom-bpw4.8

Text Generation

Model card Files Files and versions Community

Quiet-Star-Custom-bpw4.8 / inference.py

blockblockblock's picture

blockblockblock

Upload folder using huggingface_hub

750e29f verified 3 months ago

raw history blame contribute delete

No virus

991 Bytes

	import torch
	from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM

	# Hugging Face Hub repository id passed to the model and tokenizer loaders.
	model_path = "Crystalcareai/GemMoE-Medium-v0.4"

	# Load model
	# - device_map="auto": let the loader decide weight placement across devices.
	# - low_cpu_mem_usage=True: request the reduced-host-memory loading path.
	# - torch_dtype=torch.float16: load weights in half precision.
	# - attn_implementation="flash_attention_2": requires the flash-attn package
	#   and a supported GPU.
	# - trust_remote_code=True: NOTE(review) this executes Python code shipped in
	#   the model repository; only keep it for repositories you trust.
	model = AutoModelForCausalLM.from_pretrained(
		model_path,
		device_map="auto",
		low_cpu_mem_usage=True,
		torch_dtype=torch.float16,
		# A comma was missing after this argument, which made the whole call
		# a SyntaxError.
		attn_implementation="flash_attention_2",
		trust_remote_code=True,
	)

	# Tokenizer loaded from the same path as the model, plus a streamer built
	# with skip_prompt=True and skip_special_tokens=True for live output.
	tokenizer = AutoTokenizer.from_pretrained(model_path)
	streamer = TextStreamer(
		tokenizer,
		skip_prompt=True,
		skip_special_tokens=True,
	)

	# Convert prompt to tokens
	# The user prompt is wrapped in [INST] ... [/INST] markers before encoding.
	prompt_template = "[INST] {prompt} [/INST]"

	# Adjacent string literals are concatenated into one prompt string.
	prompt = (
		"You're standing on the surface of the Earth. "
		"You walk one mile south, one mile west and one mile north. "
		"You end up exactly where you started. Where are you?"
	)

	# Fill the template, encode to PyTorch tensors, and move the input ids to
	# the default CUDA device.
	full_prompt = prompt_template.format(prompt=prompt)
	encoding = tokenizer(full_prompt, return_tensors="pt")
	tokens = encoding.input_ids.cuda()

	# Generate output
	# The streamer is handed to generate() so text is emitted during generation;
	# max_new_tokens caps the continuation at 512 tokens. The call's return
	# value is kept in generation_output.
	generation_output = model.generate(tokens, streamer=streamer, max_new_tokens=512)