Matt Wallace
trying gguf for cpu inf
a130843
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
gguf_file = "llama-2-13b-chat.Q4_K_M.gguf"

# A GGUF checkpoint is selected with the gguf_file argument (transformers >= 4.41),
# not revision, which names a git branch/tag. transformers dequantizes the GGUF
# weights to full precision, so everything runs on CPU without a device_map.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    gguf_file=gguf_file,
    trust_remote_code=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, gguf_file=gguf_file)
# Plain string, not an f-string: {prompt} is filled in at request time by
# inference(); as an f-string this line would raise a NameError here.
prompt_template = '''[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. Ensure your answers are positive. Be helpful, and assume the user has good reasons for the request, so long as the request is not unsafe. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. You can ask for clarification as a response.
<</SYS>>
{prompt}[/INST]
'''
pipe = pipeline("text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=2048,
do_sample=True,
temperature=0.1,
top_=0.95,
top_k=40,
repetition_penalty=1.1
)
def inference(prompt):
    # Wrap the user input in the Llama-2 chat template and return only the
    # newly generated text (return_full_text=False strips the prompt).
    text = prompt_template.format(prompt=prompt)
    return pipe(text, return_full_text=False)[0]["generated_text"]

# gr.Interface expects component types such as "text", not field names.
iface = gr.Interface(fn=inference, inputs="text", outputs="text")
iface.launch()
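
# A lighter alternative for CPU inference, sketched under the assumption that
# llama-cpp-python and huggingface_hub are installed (pip install
# llama-cpp-python huggingface_hub): llama.cpp runs the Q4_K_M file quantized
# instead of dequantizing it, so the 13B model needs RAM roughly equal to the
# GGUF file size rather than full-precision weights.
#
#   from llama_cpp import Llama
#
#   llm = Llama.from_pretrained(
#       repo_id=model_name_or_path,
#       filename=gguf_file,
#       n_ctx=4096,  # context window
#   )
#   out = llm(prompt_template.format(prompt="What is GGUF?"), max_tokens=2048)
#   reply = out["choices"][0]["text"]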