# Hugging Face Space metadata (page residue): Spaces — Paused
import logging

import torch
from huggingface_hub import hf_hub_download
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import CTransformers, LlamaCpp

# Stream generated tokens to stdout as the LLM produces them.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
def load_models(model_id, model_basename=None):
    """Load a chat LLM, preferring the GPU model when CUDA is available.

    Args:
        model_id: Hugging Face repo id of the model.
            NOTE(review): currently unused — both branches hard-code their
            model; TODO wire this through once the intended mapping is known.
        model_basename: Optional path/filename of a local GGUF model file for
            the CPU path. Defaults to the bundled Llama-2-7B chat quantization.

    Returns:
        A LangChain LLM instance: ``CTransformers`` on GPU, ``LlamaCpp`` on CPU.
    """
    if torch.cuda.is_available():
        logging.info("Using Llama-2-7b-Chat-GPTQ")
        local_llm = CTransformers(model="TheBloke/Llama-2-7b-Chat-GPTQ")
    else:
        # Use logging (not print) for consistency with the GPU branch.
        logging.info("Using LLM on CPU")
        local_llm = LlamaCpp(
            # Bug fix: original path had a duplicated ".gguf" extension
            # ("…Q4_K_M.gguf.gguf"), which can never match the real file.
            model_path=model_basename or "llama-2-7b-chat.Q4_K_M.gguf",
            temperature=0.75,
            max_tokens=2000,
            top_p=1,
            callback_manager=callback_manager,
            verbose=True,  # required for the callback manager to emit tokens
            n_ctx=2048,
        )
    return local_llm