import time

import torch

# from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline, LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    pipeline,
)

from constants import CHROMA_SETTINGS, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY


def load_model(model_id, model_basename=None):
    """Load the LLM: a quantized GGML file via llama.cpp, or a full HF checkpoint."""
    if model_basename is not None:
        if ".ggml" in model_basename:
            # Quantized GGML checkpoint: download the single weights file and
            # serve it through llama.cpp, offloading layers to the GPU.
            model_path = hf_hub_download(repo_id=model_id, filename=model_basename)
            max_ctx_size = 2048
            kwargs = {
                "model_path": model_path,
                "n_ctx": max_ctx_size,
                "max_tokens": max_ctx_size,
                "n_gpu_layers": 1000,
                "n_batch": max_ctx_size,
            }
            return LlamaCpp(**kwargs)
        # else:
        #     if ".safetensors" in model_basename:
        #         model_basename = model_basename.replace(".safetensors", "")
        #     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        #     model = AutoGPTQForCausalLM.from_quantized(
        #         model_id,
        #         model_basename=model_basename,
        #         use_safetensors=True,
        #         trust_remote_code=True,
        #         device="cuda:0",
        #         use_triton=False,
        #         quantize_config=None,
        #     )
    else:
        # Full-precision Hugging Face checkpoint loaded in fp16.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        model.tie_weights()

    generation_config = GenerationConfig.from_pretrained(model_id)

    # Wrap the model in a text-generation pipeline so LangChain can call it.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        min_length=512,
        max_length=2048,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    return HuggingFacePipeline(pipeline=pipe)


def load_qa():
    """Build the RetrievalQA chain: instructor embeddings -> Chroma retriever -> LLM."""
    embeddings = HuggingFaceInstructEmbeddings(
        model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "cuda"}
    )

    db = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=embeddings,
        client_settings=CHROMA_SETTINGS,
    )
    retriever = db.as_retriever()

    model_id = "psmathur/orca_mini_3b"
    model_basename = None
    # model_id = "TheBloke/vicuna-7B-1.1-HF"
    # model_basename = None

    template = """You are an AI assistant for answering questions about logotherapy.
You are given the following extracted parts of an annual academic journal.
Provide a very detailed, comprehensive academic answer.
If you don't know the answer, just say "I'm not sure." Don't try to make up an answer.
If the question is not about psychotherapy and not directly covered by the given context,
politely inform the user that you are tuned to only answer questions about logotherapy.
Question: {question}
=========
{context}
=========
Answer:"""

    prompt = PromptTemplate(
        input_variables=["context", "question"], template=template
    )

    llm = load_model(model_id=model_id, model_basename=model_basename)

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    return qa
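

# Example usage (a minimal sketch, not part of the original module): build the
# chain once and reuse it for queries. Assumes the Chroma index under
# PERSIST_DIRECTORY has already been populated by a separate ingestion step;
# the sample question below is purely illustrative.
if __name__ == "__main__":
    qa_chain = load_qa()

    start = time.time()
    result = qa_chain({"query": "What is the core premise of logotherapy?"})
    print(result["result"])

    # return_source_documents=True, so the retrieved chunks are available too.
    for doc in result["source_documents"]:
        print(doc.metadata.get("source", "unknown source"))

    print(f"Answered in {time.time() - start:.1f}s")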