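"""Gradio question-answering app over local PDF documents.

Loads PDFs from ./data, splits and embeds them, and answers questions with a
LangChain RetrievalQA chain. Assumes the OPENAI_API_KEY environment variable
is set, since both the embeddings and the chat model call the OpenAI API.
"""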
import gradio as gr
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
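# NOTE: these imports target the pre-0.1 monolithic `langchain` package;
# later releases moved most of these modules into `langchain_community`.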


def load_data():
    # load every PDF under ./data
    loader = DirectoryLoader('./data', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    docs = loader.load()
    # replace newlines with spaces so chunks read as continuous text
    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " ")
    print(f"loaded {len(docs)} document pages")

    # split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)

    # embed the chunks into a Chroma vector store
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
    # also build an SVM retriever over the same splits, see
    # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
    svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
    return svm_retriever, vectorstore

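# build the retriever and vector store once at startup so every request reuses them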
svm_retriever, vectorstore = load_data()

def process_question(question, svm_retriever=svm_retriever, vectorstore=vectorstore):
    # query the SVM retriever for comparison; only the hit count is logged,
    # the answer itself comes from the Chroma retriever below
    docs_svm = svm_retriever.get_relevant_documents(question)
    print(f"SVM retriever returned {len(docs_svm)} documents")
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)
    result = qa_chain({"query": question})

    output = f"""
============RESULT==============

{result["result"]}

============SOURCES=============
"""

    # collect the formatted source chunks line by line
    lines = []

    source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
    for i, (source, content) in enumerate(source_docs):
        lines.append(f"* CHUNK: {i} *")
        lines.append(f"original doc: {source}")
        lines.append(content)
        lines.append('')  # blank line between chunks

    # join the lines into a single multi-line string
    output += '\n'.join(lines)
    return output

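# Hypothetical direct call, bypassing the UI (question taken from the
# examples below):
#   print(process_question("What is the release management process?"))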

iface = gr.Interface(
    fn=process_question,  # the function to wrap
    inputs="text",  # the input type
    outputs="text",  # the output type
    examples=[
        ["What is the process of raising an incident?"],
        ["What is Cx0 program management?"],
        ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
        ["What is the release management process?"],
    ],
)

if __name__ == "__main__":
    iface.launch()
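
# Launching serves the UI locally; Gradio prints the URL when it starts,
# by default http://127.0.0.1:7860.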