import logging

import torch
from huggingface_hub import hf_hub_download
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import CTransformers, LlamaCpp

# Stream generated tokens to stdout as they are produced
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


def load_models(model_id, model_basename=None):
    # If a GPU is available, load the quantized GPTQ model; otherwise fall back to a GGUF model on CPU
    if torch.cuda.is_available():
        logging.info("Using Llama-2-7b-Chat-GPTQ")
        local_llm = CTransformers(model="TheBloke/Llama-2-7b-Chat-GPTQ")
    else:
        print("Using LLM on CPU")
        local_llm = LlamaCpp(
            model_path="llama-2-7b-chat.Q4_K_M.gguf",
            temperature=0.75,
            max_tokens=2000,
            top_p=1,
            callback_manager=callback_manager,
            verbose=True,  # Verbose is required to pass to the callback manager
            n_ctx=2048,
        )
    return local_llm
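
# A minimal usage sketch (not part of the original listing): load the model and run a
# single prompt. The prompt text and the __main__ guard are illustrative assumptions.
if __name__ == "__main__":
    llm = load_models(model_id="TheBloke/Llama-2-7b-Chat-GPTQ")
    print(llm("Explain what LangChain is in one sentence."))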