"""Chainlit chat app answering questions over a directory of HTML documents.

The HTML files under ``data_path`` are loaded with LangChain, embedded with a
SentenceTransformer model into a Chroma vector store, and queried through a
``ConversationalRetrievalChain`` backed by a Hugging Face Hub model.
"""

import os
import re

import chainlit as cl
from bs4 import SoupStrainer
from langchain import HuggingFaceHub, PromptTemplate, LLMChain
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import BSHTMLLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.vectorstores import Chroma

# llm
model_id = "tiiuae/falcon-7b-instruct"
conv_model = HuggingFaceHub(
    huggingfacehub_api_token=os.environ["HF_API_TOKEN"],
    repo_id=model_id,
    model_kwargs={"temperature": 0.8, "max_length": 1000},
)

# chroma
data_path = "data/html"
# Chroma defaults to "sentence-transformers/all-MiniLM-L6-v2"
embed_model = "all-MiniLM-L6-v2"

# Compiled once; collapses any run of whitespace (newlines and tabs included).
_WHITESPACE_RE = re.compile(r"\s+")


def _build_html_loader(directory, tag):
    """Return a DirectoryLoader for ``*.html`` files that parses only ``tag``.

    Args:
        directory: Folder to scan for ``*.html`` files.
        tag: Name of the HTML tag handed to ``SoupStrainer``; everything
            outside that tag is skipped while parsing.
    """
    # define Beautiful Soup key word args
    bs_kwargs = {
        "features": "html.parser",
        "parse_only": SoupStrainer(tag),  # only include relevant text
    }
    # define Loader key word args
    loader_kwargs = {
        "open_encoding": "utf-8",
        "bs_kwargs": bs_kwargs,
    }
    return DirectoryLoader(
        path=directory,
        glob="*.html",
        loader_cls=BSHTMLLoader,
        loader_kwargs=loader_kwargs,
    )


# load documents
def load_documents(directory):
    """Load the ``<p>`` text of every ``*.html`` file in ``directory``.

    Args:
        directory: Folder containing the HTML files.

    Returns:
        The list of documents produced by the loader.
    """
    return _build_html_loader(directory, "p").load()


# prepare documents
def prepare_documents(documents, directory=data_path):
    """Normalise whitespace and replace each source with ``"FAR <title>"``.

    Args:
        documents: Documents to clean up; they are modified in place.
        directory: Folder whose HTML files are re-read to obtain the page
            titles. Defaults to ``data_path``.

    Returns:
        The same ``documents`` list, for call chaining.
    """
    for doc in documents:
        doc.page_content = _WHITESPACE_RE.sub(" ", doc.page_content)

    # The documents were parsed with a "<p>"-only strainer, so the title is
    # obtained from a second pass that parses only the "<title>" tag.
    title_docs = _build_html_loader(directory, "title").load()

    # Match titles to documents by file path rather than by list position, so
    # a difference in count or ordering between the two passes cannot attach
    # the wrong title or raise IndexError.
    # NOTE(review): assumes both passes record the file path under
    # metadata["source"] -- confirm against the loader in use.
    titles_by_path = {
        title_doc.metadata["source"]: title_doc.metadata["title"]
        for title_doc in title_docs
    }

    # update source metadata
    for doc in documents:
        title = titles_by_path.get(doc.metadata.get("source"))
        if title is not None:
            doc.metadata["source"] = " ".join(["FAR", title])
        # Documents without a matching title keep their existing source.

    return documents


@cl.on_chat_start
async def on_chat_start():
    """Build the retrieval chain for a new chat session and store it."""
    # Instantiate the chain for that user session
    embedding_func = SentenceTransformerEmbeddings(model_name=embed_model)

    msg = cl.Message(
        content="Loading and processing documents. This may take a while...",
        disable_human_feedback=True,
    )
    await msg.send()

    # Loading and cleaning are blocking calls; run them through make_async
    # like the Chroma indexing below so the handler awaits them instead.
    # NOTE(review): documents are re-loaded and re-embedded for every new
    # session; consider building the vector store once and reusing it.
    documents = await cl.make_async(load_documents)(data_path)
    documents = await cl.make_async(prepare_documents)(documents)

    docsearch = await cl.make_async(Chroma.from_documents)(
        documents, embedding_func
    )

    message_history = ChatMessageHistory()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        conv_model,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    msg.content = "Ready. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", chain)


@cl.on_message
async def main(message):
    """Answer an incoming chat message and list the sources that were used.

    Args:
        message: The incoming Chainlit message; its ``content`` is the
            question passed to the chain.
    """
    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
    cb = cl.AsyncLangchainCallbackHandler()

    res = await chain.acall(message.content, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]

    text_elements = [
        cl.Text(
            content=source_doc.page_content,
            name=source_doc.metadata["source"],
        )
        for source_doc in source_documents
    ]

    # dict.fromkeys de-duplicates while keeping retrieval order, so the
    # "Sources:" line is stable between runs (a set has no defined order).
    source_names = list(
        dict.fromkeys(
            source_doc.metadata["source"] for source_doc in source_documents
        )
    )

    if source_names:
        answer += f"\nSources: {', '.join(source_names)}"
    else:
        answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()