import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import (
    HuggingFaceInstructEmbeddings,
    HuggingFaceEmbeddings,
    CohereEmbeddings,
)
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatCohere
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain_community.llms import HuggingFaceHub, HuggingFaceTextGenInference

# Llama2
import torch
import transformers
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer
from torch import cuda, bfloat16

import langchain

langchain.verbose = False


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    """Split the raw text into overlapping chunks for embedding."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,       # the character length of each chunk
        chunk_overlap=100,    # the character overlap between chunks
        length_function=len   # the length function - here, character length (the Python len() fn.)
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks, selected_embedding):
    """Embed the chunks with the selected model and persist the FAISS index locally."""
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstore.save_local("faiss_index")
    return vectorstore


def load_vectorstore(selected_embedding):
    """Reload the previously saved FAISS index with the matching embedding model."""
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.load_local("faiss_index", embeddings)
    return vectorstore


def get_conversation_chain(vectorstore, selected_llm, selected_temperature):
    """Build a ConversationalRetrievalChain around the selected LLM and vector store."""
    print('Selected LLM: ' + selected_llm)
    print('Selected Temperature: ' + str(selected_temperature))

    if selected_llm == 'GPT 3.5':
        # openai_model = "gpt-4-turbo-preview"
        openai_model = "gpt-3.5-turbo"
        llm = ChatOpenAI(model=openai_model, temperature=selected_temperature)

    elif selected_llm == 'Llama2 local':
        model_id = 'meta-llama/Llama-2-7b-chat-hf'
        hf_auth = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

        model_config = transformers.AutoConfig.from_pretrained(
            model_id,
            token=hf_auth
        )

        device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

        if 'cuda' in device:
            # set quantization configuration to load the large model with less GPU memory
            # this requires the `bitsandbytes` library
            bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=bfloat16
            )
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                quantization_config=bnb_config,
                device_map='auto',
                token=hf_auth
            )
        else:
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                device_map='auto',
                token=hf_auth
            )

        # enable evaluation mode to allow model inference
        model.eval()
        print(f"Model loaded on {device}")

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_id,
            token=hf_auth
        )

        pipeline = transformers.pipeline(
            torch_dtype=torch.float32,
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,             # langchain expects the full text
            task='text-generation',
            temperature=selected_temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=512,                # max number of tokens to generate in the output
            repetition_penalty=1.1             # without this, output begins repeating
        )

        llm = HuggingFacePipeline(pipeline=pipeline)

    elif selected_llm == 'Llama2 inference':
        llm = HuggingFaceTextGenInference(
            inference_server_url=os.environ.get("INFERENCE_URL"),
            max_new_tokens=50,
            timeout=1200,
            temperature=selected_temperature
        )

    # Generic LLM wiring: conversation memory plus a retrieval chain over the vector store
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer')
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
        verbose=True,
    )
    # print(conversation_chain)
    return conversation_chain


def handle_userinput(user_question):
    # print('Question: ' + user_question)
    response = st.session_state.conversation.invoke({'question': user_question})
    answer = response.get("answer")
    sources = response.get("source_documents", [])
    # print('Answer: ' + answer)
    # print('Sources: ' + str(sources))

    with st.expander("Sources"):
        st.write(str(sources))

    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)


def main():
    load_dotenv()
    st.set_page_config(page_title="VerAi", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your new PDFs here and click on 'Process', or load the last upload by clicking on 'Load'",
            accept_multiple_files=True)
        selected_embedding = st.radio("Which Embedding?", ["Cohere-multilingual-v3.0", "OpenAI", "Instructor-xl"])
        selected_llm = st.radio("Which LLM?", ["GPT 3.5", "Llama2 local", "Llama2 inference"])
        selected_temperature = st.slider('Temperature?', 0.0, 1.0, 0.1)

        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)

                # create vector store
                vectorstore = get_vectorstore(text_chunks, selected_embedding)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)

        if st.button("Load"):
            with st.spinner("Processing"):
                # load vector store
                vectorstore = load_vectorstore(selected_embedding)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)

    if st.session_state.conversation:
        st.header("VerAi :books:")
        user_question = st.text_input("Stel een vraag hieronder")  # Dutch: "Ask a question below"
        # example questions (in Dutch): "Vertel me iets over Wettelijke uren"
        # ("Tell me something about statutory hours"), "wat zijn Overige verloftypes bij kpn"
        # ("what are the other leave types at kpn")

        if st.session_state.conversation and user_question:
            handle_userinput(user_question)


if __name__ == '__main__':
    main()
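
# ---------------------------------------------------------------------------
# Configuration note (a sketch, not part of the original app logic): the script
# calls load_dotenv(), so credentials are expected in a local .env file. The
# variable names below are assumptions except where the code reads them
# explicitly: only HUGGINGFACEHUB_API_TOKEN and INFERENCE_URL appear above;
# OPENAI_API_KEY and COHERE_API_KEY are the default names picked up by the
# OpenAI and Cohere langchain integrations.
#
#   OPENAI_API_KEY=sk-...              # used implicitly by OpenAIEmbeddings / ChatOpenAI
#   COHERE_API_KEY=...                 # used implicitly by CohereEmbeddings
#   HUGGINGFACEHUB_API_TOKEN=hf_...    # Hugging Face token for the Llama2 models
#   INFERENCE_URL=http://...           # text-generation-inference server for 'Llama2 inference'
# ---------------------------------------------------------------------------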