# Spaces: Sleeping
# File size: 3,617 Bytes
# 91d7875
from langchain.chains import RetrievalQA
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import click
import torch
from constants import CHROMA_SETTINGS
def load_model(device, model_id="tiiuae/falcon-7b-instruct"):
    """
    Build a LangChain LLM backed by a Hugging Face text-generation pipeline.

    If you are running this for the first time, the model is downloaded
    from the Hugging Face hub; subsequent runs use the copy on disk.

    Args:
        device: Target device name. "cuda" lets ``pipeline()`` load the
            weights itself; any other value pre-loads the model with
            ``AutoModelForCausalLM`` before building the pipeline.
        model_id: Hugging Face repo id of the causal LM to load. Defaults
            to the Falcon instruct model this script was written for.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured pipeline.
    """
    on_cpu = device == "cpu"

    # The tokenizer is loaded the same way for every device.
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # NOTE(review): the original indentation was lost; the explicit model
    # load is assumed to belong to the non-CUDA branch only -- confirm.
    if device == "cuda":
        # Pass the repo id through so pipeline() loads the weights itself.
        model = model_id
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float32 if on_cpu else torch.bfloat16,
        trust_remote_code=True,
        device_map=device if on_cpu else "auto",
        max_length=2048,
        temperature=0,
        top_p=0.95,
        top_k=10,
        repetition_penalty=1.15,
        num_return_sequences=1,
        # Reuse the end-of-sequence token as the padding token.
        pad_token_id=tokenizer.eos_token_id,
    )

    return HuggingFacePipeline(pipeline=pipe)
# @click.command()
# @click.option('--device_type', default='gpu', help='device to run on, select gpu or cpu')
# def main(device_type, ):
# # load the instructorEmbeddings
# if device_type in ['cpu', 'CPU']:
# device='cpu'
# else:
# device='cuda'
## for M1/M2 users:
@click.command()
@click.option('--device_type', default='cuda', help='device to run on, select gpu, cpu or mps')
def main(device_type):
    """
    Run an interactive question-answering loop over the persisted Chroma store.

    Loads the instructor embeddings and the Chroma vector store from
    PERSIST_DIRECTORY, builds a "stuff" RetrievalQA chain around the LLM
    returned by ``load_model``, then repeatedly prompts for a query and
    prints the answer followed by the source documents it was built from.
    Typing ``exit`` ends the loop.

    Args:
        device_type: Value of the ``--device_type`` option. "cpu" and "mps"
            are matched case-insensitively; any other value selects "cuda".
    """
    # Normalise the flag so spellings such as "Cpu" or " mps" are honoured
    # instead of silently falling through to CUDA.
    requested = device_type.strip().lower()
    device = requested if requested in ("cpu", "mps") else "cuda"
    print(f"Running on: {device}")

    # Load the instructor embeddings on the selected device.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="hkunlp/instructor-base",
        model_kwargs={"device": device},
    )

    # Load the vector store and expose it as a retriever.
    db = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=embeddings,
        client_settings=CHROMA_SETTINGS,
    )
    retriever = db.as_retriever()

    # Load the LLM that generates the natural-language responses.
    llm = load_model(device)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    separator = "----------------------------------SOURCE DOCUMENTS---------------------------"

    # Interactive questions and answers.
    while True:
        query = input("\nEnter a query: ").strip()
        if query == "exit":
            break
        if not query:
            # Nothing to ask; do not run the chain on an empty prompt.
            continue

        # Get the answer from the chain.
        res = qa(query)
        answer, docs = res['result'], res['source_documents']

        # Print the result.
        print("\n\n> Question:")
        print(query)
        print("\n> Answer:")
        print(answer)

        # Print the relevant sources used for the answer.
        print(separator)
        for document in docs:
            # Fall back to a placeholder when a document has no "source" key.
            source = document.metadata.get("source", "unknown")
            print(f"\n> {source}:")
            print(document.page_content)
        print(separator)
# Invoke the click command only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|