|
import os |
|
|
|
from langchain.document_loaders import DirectoryLoader |
|
from langchain.document_loaders import BSHTMLLoader |
|
from bs4 import SoupStrainer |
|
import re |
|
|
|
from langchain import HuggingFaceHub, PromptTemplate, LLMChain |
|
from langchain.embeddings import SentenceTransformerEmbeddings |
|
from langchain.vectorstores import Chroma |
|
|
|
from langchain.chains import ConversationalRetrievalChain |
|
from langchain.memory import ChatMessageHistory, ConversationBufferMemory |
|
|
|
import chainlit as cl |
|
|
|
|
|
# Hugging Face Hub repository of the LLM passed to the retrieval chain.
model_id = "tiiuae/falcon-7b-instruct"

# The API token is read from the environment at import time, so a missing
# HF_API_TOKEN variable raises KeyError as soon as this module is loaded.
conv_model = HuggingFaceHub(
    repo_id=model_id,
    huggingfacehub_api_token=os.environ["HF_API_TOKEN"],
    model_kwargs={
        "temperature": 0.8,
        "max_length": 1000,
    },
)

# Directory scanned for the ``*.html`` files that get indexed.
data_path = "data/html"

# Model name handed to SentenceTransformerEmbeddings for document embedding.
embed_model = "all-MiniLM-L6-v2"
|
|
|
|
|
def load_documents(directory):
    """Load the ``*.html`` files found in *directory* as documents.

    Files are opened as UTF-8 and parsed with ``html.parser``; a
    ``SoupStrainer("p")`` restricts parsing to ``<p>`` elements.

    Args:
        directory: Path of the folder handed to ``DirectoryLoader``.

    Returns:
        The documents returned by the loader's ``load()`` call.
    """
    paragraphs_only = SoupStrainer("p")
    html_loader = DirectoryLoader(
        path=directory,
        glob="*.html",
        loader_cls=BSHTMLLoader,
        loader_kwargs={
            "open_encoding": "utf-8",
            "bs_kwargs": {
                "features": "html.parser",
                "parse_only": paragraphs_only,
            },
        },
    )
    return html_loader.load()
|
|
|
|
|
|
|
# Compiled once: matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r"\s+")


def _load_titles(directory):
    """Return the ``title`` metadata of every ``*.html`` file in *directory*."""
    title_loader = DirectoryLoader(
        path=directory,
        glob="*.html",
        loader_cls=BSHTMLLoader,
        loader_kwargs={
            "open_encoding": "utf-8",
            # Only the <title> element is parsed; the body is not needed here.
            "bs_kwargs": {
                "features": "html.parser",
                "parse_only": SoupStrainer("title"),
            },
        },
    )
    return [doc.metadata["title"] for doc in title_loader.load()]


def prepare_documents(documents, directory=None):
    """Normalise document text and label each document with its page title.

    Every run of whitespace in ``page_content`` is collapsed to a single
    space, and ``metadata["source"]`` is overwritten with ``"FAR <title>"``,
    where the title comes from a second, title-only load of the HTML files.

    Args:
        documents: Documents to clean up; they are modified in place.
        directory: Folder the titles are read from. Defaults to the
            module-level ``data_path`` when omitted.

    Returns:
        The same ``documents`` sequence that was passed in.

    Raises:
        IndexError: If fewer titles than documents were found. The check
            runs before any document is modified.
    """
    if directory is None:
        directory = data_path

    titles = _load_titles(directory)
    if len(titles) < len(documents):
        raise IndexError(
            f"found {len(titles)} title(s) in {directory!r} "
            f"for {len(documents)} document(s)"
        )

    # NOTE(review): titles are paired with documents by position, which
    # assumes both loads return the files in the same order -- confirm.
    for doc, title in zip(documents, titles):
        # A single \s+ substitution also covers "\n" and "\t", so no separate
        # replace() calls are needed beforehand.
        doc.page_content = _WHITESPACE_RUN.sub(" ", doc.page_content)
        doc.metadata["source"] = " ".join(["FAR", title])
    return documents
|
|
|
@cl.on_chat_start
async def on_chat_start():
    """Build the conversational retrieval chain for a new chat session.

    Sends a progress message, loads and prepares the HTML documents from
    ``data_path``, indexes them in a Chroma vector store, wires the store
    and a conversation memory into a ``ConversationalRetrievalChain``, and
    stores that chain in the user session under the key ``"chain"``.

    The blocking steps (embedding model construction, document loading and
    preparation, index build) run through ``cl.make_async`` so they do not
    stall the event loop while they work.
    """
    # Tell the user what is happening before any slow work starts.
    msg = cl.Message(
        content="Loading and processing documents. This may take a while...",
        disable_human_feedback=True,
    )
    await msg.send()

    embedding_func = await cl.make_async(SentenceTransformerEmbeddings)(
        model_name=embed_model
    )

    documents = await cl.make_async(load_documents)(data_path)
    documents = await cl.make_async(prepare_documents)(documents)

    docsearch = await cl.make_async(Chroma.from_documents)(
        documents,
        embedding_func,
    )

    # output_key="answer" tells the memory which chain output to record; the
    # chain also returns "source_documents" (return_source_documents=True).
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=ChatMessageHistory(),
        return_messages=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        conv_model,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    msg.content = "Ready. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", chain)
|
|
|
|
|
@cl.on_message
async def main(message):
    """Answer an incoming chat message using the session's retrieval chain.

    Runs the chain stored under ``"chain"`` in the user session on
    ``message.content``, attaches each returned source document as a text
    element, and appends a line naming the sources to the answer.

    Args:
        message: The incoming Chainlit message; only ``content`` is read.
    """
    chain = cl.user_session.get("chain")
    cb = cl.AsyncLangchainCallbackHandler()

    res = await chain.acall(message.content, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]

    # One text element per retrieved document, named after its source.
    text_elements = [
        cl.Text(content=doc.page_content, name=doc.metadata["source"])
        for doc in source_documents
    ]

    # dict.fromkeys removes duplicate names while keeping first-seen order,
    # so the "Sources:" line is the same on every run (a set is not ordered).
    source_names = list(
        dict.fromkeys(doc.metadata["source"] for doc in source_documents)
    )

    if source_names:
        answer += f"\nSources: {', '.join(source_names)}"
    else:
        answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()