import gradio as gr
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


def load_data():
    # load the PDF documents from ./data
    loader = DirectoryLoader('./data', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    docs = loader.load()

    # replace all newlines with spaces
    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " ")
    print(docs)

    # split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)

    # construct the vector store
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

    # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
    svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())

    return svm_retriever, vectorstore


svm_retriever, vectorstore = load_data()


def process_question(question, history, svm_retriever=svm_retriever, vectorstore=vectorstore):
    # the SVM retriever is only queried here to log how many chunks it finds
    docs_svm = svm_retriever.get_relevant_documents(question)
    print(len(docs_svm))

    # answer the question with a RetrievalQA chain over the Chroma retriever
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True,
    )
    result = qa_chain({"query": question})

    output = f"""============RESULT============== \n {result["result"]} \n ============SOURCES============= """

    # Initialize an empty list to hold the lines
    lines = []
    source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
    for i, doc in enumerate(source_docs):
        lines.append(f"* CHUNK: {i} *")
        lines.append(f"original doc: {doc[0]}")
        lines.append(f"{doc[1]}")
        lines.append('')  # for a newline between chunks

    # Join the lines with a newline character to get the multi-line string
    output += '\n'.join(lines)
    return output


iface = gr.ChatInterface(
    title="Internal DOC QA",
    theme=gr.themes.Soft(),
    fn=process_question,  # the function to wrap
    # inputs="text",   # the input type
    # outputs="text",  # the output type
    examples=[
        ["What is the process of raising an incident?"],
        ["What is Cx0 program management?"],
        ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
        ["What is the release management process?"],
    ],
)

if __name__ == "__main__":
    iface.launch()