"""Streamlit demo: run one fixed prompt through a local Llama 2 GGUF model.

The script loads ``llama-2-7b.Q5_0.gguf`` from the current working directory
with llama-cpp-python, generates a short completion for a hard-coded prompt,
and writes the prompt and the completion result to the page and to stdout.
"""
import os

import streamlit as st
from llama_cpp import Llama

# Load model directly
# from transformers import AutoModel, AutoModelForCausalLM

# Optional Hugging Face login. The model below is read from a local file, so
# the login is only performed when a token is actually configured; when the
# HF_TOKEN3 variable is unset the call is skipped instead of passing
# token=None.
access_token = os.getenv('HF_TOKEN3')
if access_token:
    # Imported lazily so the script does not require huggingface_hub unless a
    # token is supplied. (Previously `login` was called without being
    # imported, which raised NameError at startup.)
    from huggingface_hub import login

    login(token=access_token)

file = 'llama-2-7b.Q5_0.gguf'

# The path is relative to the current working directory, not to this script.
llm = Llama(
    model_path="./" + file,
    # n_gpu_layers=-1, # Uncomment to use GPU acceleration
    # seed=1337, # Uncomment to set a specific seed
    # n_ctx=2048, # Uncomment to increase the context window
)

prompt = "Q: Name the planets in the solar system? A: "
output = llm(
    prompt,  # Prompt
    max_tokens=32,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
    stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
    echo=True,  # Echo the prompt back in the output
)  # Generate a completion, can also call create_completion
print(output)

# Disabled alternative loader, kept for reference:
# NO_GPU = 0
# GPU_LAYERS = 50
# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)
# # model = AutoModelForCausalLM.from_pretrained("valencar/llamm",
# #     model_file=file, model_type="llama", gpu_layers=NO_GPU)
# # access_token = os.getenv('HF_TOKEN2')
# # login(token = access_token)
# prompt = "AI is going to"

# Render the prompt followed by the completion result returned by llm(...).
with st.container():
    st.write('\n\n')
    st.write(prompt)
    answer = output
    st.write(answer)
    print(answer)