import gradio as gr
from llama_cpp import Llama


def llama_cpp_chat(gguf_model, prompt: str, messages: str = ''):
    # Prepend the running transcript, then template the new turn with the
    # HUMAN/ASSISTANT markers the model expects
    prompt_templated = f'{messages}\n### HUMAN:\n{prompt}\n### ASSISTANT:'
    output = gguf_model(
        prompt_templated,
        max_tokens=512,
        stop=["### HUMAN:\n", "### ASSISTANT:"],  # Stop just before the model would generate a new question
        echo=True,  # Echo the prompt back in the output
    )  # Generate a completion; create_completion can also be called directly
    print(output)
    return output['choices'][0]['text']
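The helper can also be exercised directly once `llm` (created just below) exists; the prompt here is an illustrative assumption, not an example from the source:

```python
# Hypothetical single-turn call; with echo=True the returned text is the
# templated prompt followed by the model's reply.
text = llama_cpp_chat(llm, "Suggest three names for a coffee shop.")
print(text.split('### ASSISTANT:')[-1])  # keep only the new reply
```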
llm = Llama(
    model_path="llama3_8b_chat_brainstorm.Q2_K.gguf",
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,  # Uncomment to set a specific seed
    # n_ctx=2048,  # Uncomment to increase the context window
)
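`model_path` must point to a GGUF file already present on local disk. If the weights live in a Hub repo instead, one common pattern is to fetch them at startup with `huggingface_hub.hf_hub_download` and pass the returned path to `Llama`; the repo id below is a hypothetical placeholder, since the source doesn't say where the file is hosted:

```python
from huggingface_hub import hf_hub_download

# Hypothetical repo id -- replace with the repo that actually hosts the GGUF file.
model_path = hf_hub_download(
    repo_id="your-username/llama3_8b_chat_brainstorm-GGUF",
    filename="llama3_8b_chat_brainstorm.Q2_K.gguf",
)
```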
def chatty(prompt, messages):
    print(prompt)
    print(f'messages: {messages}')
    # Rebuild the transcript from Gradio's list of [user, assistant] pairs
    past_messages = ''
    for idx, message in enumerate(messages):
        print(f'idx: {idx}, message: {message}')
        past_messages += f'\n### HUMAN: {message[0]}'
        past_messages += f'\n### ASSISTANT: {message[1]}'
    print(f'past_messages: {past_messages}')
    response = llama_cpp_chat(llm, prompt, past_messages)
    # echo=True returns the whole transcript; keep only the newest reply
    return response.split('### ASSISTANT:')[-1]
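`gr.ChatInterface` calls `fn(message, history)`, where `history` is a list of `[user, assistant]` pairs (Gradio's default tuple format), which is why `chatty` reads `message[0]` and `message[1]`. A second-turn invocation, with made-up strings, would look like:

```python
# Hypothetical history from one earlier exchange (illustrative strings only).
history = [["Suggest a name for a coffee shop.", "How about Bean There?"]]
print(chatty("Give me one more.", history))
```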
demo = gr.ChatInterface(
    fn=chatty,
    title="Brainstorm on CPU with llama.cpp",
    description="Please note that CPU prediction will be very slow - but this can run on the Free Tier :)",
)
if __name__ == "__main__":
    demo.launch()
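Because CPU decoding is slow, a streaming variant can improve perceived latency: llama-cpp-python accepts `stream=True` and yields completion chunks, and `gr.ChatInterface` streams automatically when `fn` is a generator. A minimal sketch under those assumptions (not part of the original Space):

```python
def chatty_stream(prompt, messages):
    # Same HUMAN/ASSISTANT templating as chatty(), but yield partial replies
    past = ''.join(f'\n### HUMAN: {u}\n### ASSISTANT: {a}' for u, a in messages)
    prompt_templated = f'{past}\n### HUMAN:\n{prompt}\n### ASSISTANT:'
    partial = ''
    for chunk in llm(prompt_templated, max_tokens=512,
                     stop=["### HUMAN:\n", "### ASSISTANT:"], stream=True):
        partial += chunk['choices'][0]['text']
        yield partial  # ChatInterface re-renders the growing reply

# Passing fn=chatty_stream to gr.ChatInterface above enables token streaming.
```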