Spaces:

smkerr
/

rag-chat

Runtime error

App Files Files Community

smkerr committed on Dec 13, 2023

Commit

6fa8438

•

1 Parent(s): 0a8e1e5

Upload rag-chat.py

Browse files

Files changed (1) hide show

rag-chat.py +159 -0

rag-chat.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import os
+from langchain.document_loaders import DirectoryLoader
+from langchain.document_loaders import BSHTMLLoader
+from bs4 import SoupStrainer
+import re
+from langchain import HuggingFaceHub, PromptTemplate, LLMChain
+from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+import chainlit as cl
+# llm -- hosted model id, passed as repo_id to the HuggingFaceHub client below
+model_id = "tiiuae/falcon-7b-instruct"
+conv_model = HuggingFaceHub(
+    huggingfacehub_api_token=os.environ['HF_API_TOKEN'],  # raises KeyError at import time if HF_API_TOKEN is unset
+    repo_id=model_id,
+    model_kwargs={"temperature":0.8,"max_length": 1000}  # presumably forwarded to the hosted model as generation settings -- confirm
+    )
+# chroma -- inputs used by on_chat_start when it builds the vector store
+data_path = "data/html"  # directory whose *.html files are loaded
+embed_model = "all-MiniLM-L6-v2" # passed to SentenceTransformerEmbeddings; Chroma defaults to "sentence-transformers/all-MiniLM-L6-v2"
+# load documents
+def load_documents(directory):
+    # define Beautiful Soup key word args
+    bs_kwargs = {
+        "features": "html.parser",
+        "parse_only": SoupStrainer("p") # only include relevant text
+        }
+    # define Loader key word args
+    loader_kwargs = {
+        "open_encoding": "utf-8",
+        "bs_kwargs": bs_kwargs
+        }
+    # define Loader
+    loader = DirectoryLoader(
+        path=directory,
+        glob="*.html",
+        loader_cls=BSHTMLLoader,
+        loader_kwargs=loader_kwargs
+        )
+    documents = loader.load()
+    return documents
+# prepare documents
+def prepare_documents(documents):
+    for doc in documents:
+        doc.page_content = doc.page_content.replace("\n", " ").replace("\t", " ")
+        doc.page_content = re.sub("\\s+", " ", doc.page_content)
+    # define Beautiful Soup key word args
+    bs_kwargs = {
+        "features": "html.parser",
+        "parse_only": SoupStrainer("title") # only include relevant text
+        }
+    # define Loader key word args
+    loader_kwargs = {
+        "open_encoding": "utf-8",
+        "bs_kwargs": bs_kwargs
+        }
+    loader = DirectoryLoader(
+        path=data_path,
+        glob="*.html",
+        loader_cls=BSHTMLLoader,
+        loader_kwargs=loader_kwargs
+        )
+    document_sources = loader.load()
+    # convert source metadata into a list
+    source_list = [doc.metadata["title"] for doc in document_sources]
+    # update source metadata
+    i = 0
+    for doc in documents:
+        doc.metadata["source"] = " ".join(["FAR", source_list[i]])
+        i += 1
+    return documents
+@cl.on_chat_start
+async def on_chat_start():
+    # Instantiate the chain for that user session
+    embedding_func = SentenceTransformerEmbeddings(model_name=embed_model)
+    msg = cl.Message(
+        content="Loading and processing documents. This may take a while...",
+        disable_human_feedback=True)
+    await msg.send()
+    documents = load_documents(data_path)
+    documents = prepare_documents(documents)
+    docsearch = await cl.make_async(Chroma.from_documents)(
+        documents,
+        embedding_func
+    )
+    message_history = ChatMessageHistory()
+    memory = ConversationBufferMemory(
+        memory_key="chat_history",
+        output_key="answer",
+        chat_memory=message_history,
+        return_messages=True,
+    )
+    chain = ConversationalRetrievalChain.from_llm(
+        conv_model,
+        chain_type="stuff",
+        retriever=docsearch.as_retriever(),
+        memory=memory,
+        return_source_documents=True,
+    )
+    msg.content = "Ready. You can now ask questions!"
+    await msg.update()
+    cl.user_session.set("chain", chain)
+@cl.on_message
+async def main(message):
+    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
+    cb = cl.AsyncLangchainCallbackHandler()
+    res = await chain.acall(message.content, callbacks=[cb])
+    answer = res["answer"]
+    source_documents = res["source_documents"]
+    text_elements = []
+    source_names = set()  # Use a set to store unique source names
+    for idx, source_doc in enumerate(source_documents):
+        source_name = source_doc.metadata["source"]
+        text_elements.append(
+                cl.Text(content=source_doc.page_content,
+                        name=source_name))
+        source_names.add(source_name)  # Add the source name to the set
+    if source_names:
+            answer += f"\nSources: {', '.join(source_names)}"
+    else:
+            answer += "\nNo sources found"
+    await cl.Message(content=answer, elements=text_elements).send()