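"""Gradio question-answering app over local PDF documents.

Loads PDFs from ./data, splits and embeds them, and answers questions with a
LangChain RetrievalQA chain. Assumes the OPENAI_API_KEY environment variable
is set, since both the embeddings and the chat model call the OpenAI API.
"""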
import gradio as gr
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
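# NOTE: these imports target the pre-0.1 monolithic `langchain` package;
# later releases moved most of these modules into `langchain_community`.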


def load_data():
    # load every PDF under ./data
    loader = DirectoryLoader('./data', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    docs = loader.load()
    # replace newlines with spaces so chunks read as continuous text
    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " ")
    print(f"loaded {len(docs)} document pages")

    # split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)

    # embed the chunks into a Chroma vector store
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
    # also build an SVM retriever over the same splits, see
    # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
    svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
    return svm_retriever, vectorstore

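# build the retriever and vector store once at startup so every request reuses them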
svm_retriever, vectorstore = load_data()

def process_question(question, svm_retriever=svm_retriever, vectorstore=vectorstore):
    # query the SVM retriever for comparison; only the hit count is logged,
    # the answer itself comes from the Chroma retriever below
    docs_svm = svm_retriever.get_relevant_documents(question)
    print(f"SVM retriever returned {len(docs_svm)} documents")
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), return_source_documents=True)
    result = qa_chain({"query": question})

    output = f"""
============RESULT==============

{result["result"]}

============SOURCES=============
"""

    # collect the formatted source chunks line by line
    lines = []

    source_docs = [(x.metadata["source"], x.page_content) for x in result["source_documents"]]
    for i, (source, content) in enumerate(source_docs):
        lines.append(f"* CHUNK: {i} *")
        lines.append(f"original doc: {source}")
        lines.append(content)
        lines.append('')  # blank line between chunks

    # join the lines into a single multi-line string
    output += '\n'.join(lines)
    return output

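# Hypothetical direct call, bypassing the UI (question taken from the
# examples below):
#   print(process_question("What is the release management process?"))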

iface = gr.Interface(
    fn=process_question,  # the function to wrap
    inputs="text",  # the input type
    outputs="text",  # the output type
    examples=[
        ["What is the process of raising an incident?"],
        ["What is Cx0 program management?"],
        ["What is the process for identifying risks that can impact the desired outcomes of a project?"],
        ["What is the release management process?"],
    ],
)

if __name__ == "__main__":
    iface.launch()
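
# Launching serves the UI locally; Gradio prints the URL when it starts,
# by default http://127.0.0.1:7860.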