import os

import streamlit as st
from huggingface_hub import login
from llama_cpp import Llama

# Log in to the Hugging Face Hub with the token stored in the HF_TOKEN secret
# (skip when no token is set, since login(token=None) prompts interactively).
access_token = os.getenv('HF_TOKEN')
if access_token:
    login(token=access_token)

# Quantized Llama 2 7B model in GGUF format (Q5_0), expected next to this script.
file = 'llama-2-7b.Q5_0.gguf'
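# If the GGUF file is not stored alongside the app, it could be fetched from the
# Hub instead. A sketch (the repo_id is an assumption, not from the original code):
#
#     from huggingface_hub import hf_hub_download
#     model_path = hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF", filename=file)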
# Load the model with llama-cpp-python.
llm = Llama(
    model_path="./" + file,
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)
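# Streamlit reruns the whole script on every interaction, so a multi-GB model
# loaded at module level is reloaded on each rerun. A sketch of caching the load
# with Streamlit's st.cache_resource (assumes Streamlit >= 1.18):
#
#     @st.cache_resource
#     def load_llm():
#         return Llama(model_path="./" + file)
#
#     llm = load_llm()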
prompt = "Q: Name the planets in the solar system? A: " | |
output = llm( | |
prompt, # Prompt | |
max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window | |
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question | |
echo=True # Echo the prompt back in the output | |
) # Generate a completion, can also call create_completion | |
print(output) | |
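# llm(...) returns a dict shaped like OpenAI's legacy text-completion response.
# Roughly (field values here are illustrative placeholders, not real output):
#
#     {
#         "id": "cmpl-...",
#         "object": "text_completion",
#         "model": "./llama-2-7b.Q5_0.gguf",
#         "choices": [{"text": "...", "index": 0, "finish_reason": "stop", ...}],
#         "usage": {"prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...},
#     }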
# Alternative loader kept for reference. Note that the model_type/gpu_layers
# signature below is the ctransformers API, not transformers':
# from ctransformers import AutoModelForCausalLM
# NO_GPU = 0
# GPU_LAYERS = 50
# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)
# llm = AutoModelForCausalLM.from_pretrained("valencar/llamm",
#     model_file=file, model_type="llama", gpu_layers=NO_GPU)
# access_token = os.getenv('HF_TOKEN2')
# login(token=access_token)
# prompt = "AI is going to"
# Render the prompt and the completion in the Streamlit app.
with st.container():
    st.write('\n\n')
    st.write(prompt)
    # The generated text lives under choices[0]["text"]; with echo=True it
    # includes the prompt itself.
    answer = output['choices'][0]['text']
    st.write(answer)
    print(answer)
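# To run locally: streamlit run app.py
# (app.py is the conventional Spaces entry point; adjust to this file's actual
# name, with the GGUF file in the working directory and HF_TOKEN set.)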