I have to run this LLaMA-2 model on GPU but I'm getting errors

#26
by heiskareem

Before this, I was using CTransformers to load the model, but it only ran on CPU. Now I'm using AutoModelForCausalLM (from ctransformers) to run on the GPU, and I need to integrate it with my RAG pipeline. With the implementation below I'm getting an error:

from ctransformers import AutoModelForCausalLM
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# generation settings forwarded to ctransformers
config = {'max_new_tokens': 512, 'repetition_penalty': 1.1,
          'temperature': 0.01, 'stream': True}


def loading_LLM():
    select_gpu_layers = 21  # number of layers to offload to the GPU

    # LLaMA_model holds the path/repo id of the local GGML model
    llm = AutoModelForCausalLM.from_pretrained(
        LLaMA_model,
        model_type="llama",
        gpu_layers=select_gpu_layers,
        **config
    )

    return llm


def load_prompt_for_document():
    template = """Use the provided context to answer the user's question. If you don't know the answer, return "I don't know".
    Context: {context}
    Question: {question}
    Answer:
    """
    prompt = PromptTemplate(template=template, input_variables=['context', 'question'])
    return prompt 


def vector_storage_by_index(db_location):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cuda'})  # run the embedding model on the GPU

    vdb = FAISS.load_local(db_location, embeddings)
    return vdb


def chain_QA(db_location, promt_pass):
    llm = loading_LLM()
    vdb = vector_storage_by_index(db_location)
    prompt = promt_pass
    retriever = vdb.as_retriever(search_kwargs={'k': 2})  # k = number of nearest neighbours returned by the similarity search
    chain_return = RetrievalQA.from_chain_type(llm=llm,
                                           chain_type='stuff',
                                           retriever=retriever,
                                           return_source_documents=True,
                                           chain_type_kwargs={'prompt': prompt})
    return chain_return


def get_response(query, chain_res):
    return chain_res({'query': query})['result']


db_location = 'faiss/document'
promt_pass = load_prompt_for_document()

chain_qa = chain_QA(db_location, promt_pass)
user_query = input("\nEnter your query: ")

current_response = get_response(query=user_query, chain_res=chain_qa)
print(f'\nAI: {current_response}\n')

Error:

ggml_cuda_set_main_device: using device 0 (NVIDIA RTX A5500) as main device
/home/UNT/ks1249/.local/lib/python3.10/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  return self.fget.__get__(instance, owner)()
Traceback (most recent call last):
  File "/mnt/DATA/madara/Projects/S3LLM/app.py", line 185, in <module>
    chain_qa = chain_QA(db_location, promt_pass)
  File "/mnt/DATA/madara/Projects/S3LLM/app.py", line 146, in chain_QA
    chain_return = RetrievalQA.from_chain_type(llm=llm,
  File "/home/UNT/ks1249/.local/lib/python3.10/site-packages/langchain/chains/retrieval_qa/base.py", line 100, in from_chain_type
    combine_documents_chain = load_qa_chain(
  File "/home/UNT/ks1249/.local/lib/python3.10/site-packages/langchain/chains/question_answering/__init__.py", line 249, in load_qa_chain
    return loader_mapping[chain_type](
  File "/home/UNT/ks1249/.local/lib/python3.10/site-packages/langchain/chains/question_answering/__init__.py", line 73, in _load_stuff_chain
    llm_chain = LLMChain(
  File "/home/UNT/ks1249/.local/lib/python3.10/site-packages/langchain/load/serializable.py", line 74, in __init__
    super().__init__(**kwargs)
  File "/home/UNT/ks1249/.local/lib/python3.10/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 1 validation error for LLMChain
llm
  value is not a valid dict (type=type_error.dict)
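
If I'm reading the traceback right, LLMChain's pydantic validation rejects the raw ctransformers model because it isn't a LangChain LLM object. The direction I'm considering is LangChain's own CTransformers wrapper, which (if I understand the docs) forwards gpu_layers through its config dict and returns an object that RetrievalQA will accept. A minimal sketch, assuming a CUDA-enabled ctransformers build:

from langchain.llms import CTransformers

# Sketch: wrap the GGML model in LangChain's CTransformers LLM so that
# RetrievalQA/LLMChain validation accepts it; gpu_layers goes inside config.
llm = CTransformers(
    model=LLaMA_model,  # same local model path/repo id as above
    model_type="llama",
    config={'max_new_tokens': 512, 'repetition_penalty': 1.1,
            'temperature': 0.01, 'stream': True,
            'gpu_layers': 21},  # layers offloaded to the GPU
)

Can anyone confirm whether this is the right way to keep GPU offload while still using RetrievalQA?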

I have also tried implementing this with LlamaCpp, but it didn't help with loading the local model and combining it with RAG.
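
Roughly the kind of LlamaCpp loading I attempted (the model path here is a placeholder):

from langchain.llms import LlamaCpp

# Sketch of the LlamaCpp attempt: n_gpu_layers offloads layers to the GPU
llm = LlamaCpp(
    model_path="models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    n_gpu_layers=21,   # layers offloaded to the GPU
    n_ctx=2048,        # context window
    temperature=0.01,
    max_tokens=512,
)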

I want a LLaMA-2 + RAG implementation on GPU using FAISS and LangChain's RetrievalQA.
