# Generative_QNA / run_llama.py
import logging

import torch
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import CTransformers, LlamaCpp

# Stream generated tokens to stdout as they are produced
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
def load_models(model_id, model_basename=None):
    """Load a Llama-2 chat model: GPTQ on GPU if available, otherwise a GGUF model on CPU via llama.cpp.

    NOTE: model_id and model_basename are accepted but currently unused; the models below are hard-coded.
    """
    # If a GPU is available, use the quantized GPTQ model; otherwise fall back to CPU inference
    if torch.cuda.is_available():
        logging.info("Using Llama-2-7b-Chat-GPTQ")
        local_llm = CTransformers(model="TheBloke/Llama-2-7b-Chat-GPTQ")
    else:
        logging.info("Using LLM on CPU")
        local_llm = LlamaCpp(
            model_path="llama-2-7b-chat.Q4_K_M.gguf",
            temperature=0.75,
            max_tokens=2000,
            top_p=1,
            callback_manager=callback_manager,
            verbose=True,  # Verbose is required to pass to the callback manager
            n_ctx=2048,
        )
    return local_llm
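

# A minimal usage sketch (assumption: in the Space this function is imported and wired into a
# LangChain QA chain elsewhere; the model_id/model_basename values below are illustrative only,
# since load_models as written ignores its arguments and uses the hard-coded models above):
if __name__ == "__main__":
    llm = load_models(
        model_id="TheBloke/Llama-2-7b-Chat-GGUF",      # hypothetical identifier
        model_basename="llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical file name
    )
    # Tokens stream to stdout via StreamingStdOutCallbackHandler while the full answer is returned
    answer = llm("Q: What is retrieval-augmented generation? A:")
    print(answer)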