Spaces:
Runtime error
Runtime error
File size: 2,831 Bytes
611aebd 9c2548e 13b16b6 611aebd 9c2548e 611aebd 9c2548e 13b16b6 9c2548e 13b16b6 9c2548e 13b16b6 ee30e14 13b16b6 9c2548e 373316d 9c2548e 373316d 13b16b6 9c2548e 611aebd 9c2548e 13b16b6 9c2548e 13b16b6 611aebd ee30e14 9c2548e ee30e14 13b16b6 611aebd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import gradio as gr
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SVMRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
def load_data():
    """Load all PDFs under ./data, split them into chunks, and build retrievers.

    Returns:
        tuple: (SVMRetriever over the chunks, Chroma vector store), both
        embedded with OpenAIEmbeddings.
    """
    # Load every PDF found recursively under ./data.
    loader = DirectoryLoader('./data', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
    docs = loader.load()
    # Normalize page text: replace newlines with spaces. A plain loop, not a
    # side-effect list comprehension — comprehensions are for building lists.
    for doc in docs:
        doc.page_content = doc.page_content.replace("\n", " ")
    print(docs)
    # Split the documents into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)
    # Dense vector store used by the QA chain.
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())
    # SVM-based retriever (currently only used for diagnostics downstream).
    # https://python.langchain.com/docs/use_cases/question_answering.html#go-deeper-3
    svm_retriever = SVMRetriever.from_documents(all_splits, OpenAIEmbeddings())
    return svm_retriever, vectorstore
# Build both retrievers once at import time so every chat request reuses them.
svm_retriever, vectorstore = load_data()
def process_question(question, history, svm_retriever=svm_retriever, vectorstore=vectorstore):
    """Answer a chat question with RetrievalQA and append the source chunks.

    Args:
        question: the user's question from the chat box.
        history: chat history passed by gr.ChatInterface (unused).
        svm_retriever: SVM retriever bound at import time (diagnostics only).
        vectorstore: Chroma store bound at import time; backs the QA chain.

    Returns:
        str: the model's answer followed by the retrieved source chunks.
    """
    # Diagnostic only: log how many chunks the SVM retriever would select.
    # NOTE(review): the answer below uses the vector-store retriever instead.
    docs_svm = svm_retriever.get_relevant_documents(question)
    print(len(docs_svm))

    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    chain = RetrievalQA.from_chain_type(
        chat_model,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True,
    )
    answer = chain({"query": question})

    header = f"""============RESULT==============
\n
{answer["result"]}
\n
============SOURCES=============
"""
    # Assemble one text section per source chunk, blank-line separated.
    chunk_lines = []
    for idx, src_doc in enumerate(answer["source_documents"]):
        chunk_lines.append(f"* CHUNK: {idx} *")
        chunk_lines.append(f"original doc: {src_doc.metadata['source']}")
        chunk_lines.append(f"{src_doc.page_content}")
        chunk_lines.append('')  # blank line between chunks
    return header + '\n'.join(chunk_lines)
# Build the chat UI. gr.ChatInterface feeds (message, history) to fn and
# renders the returned string, so no explicit input/output components are needed.
iface = gr.ChatInterface(
    title="Internal DOC QA",
    # Gradio expects a Theme instance (or name string), not the class itself.
    theme=gr.themes.Soft(),
    fn=process_question,  # answers each chat message
    examples=[
        ["what is the process of raising an incident?"],
        ["What is Cx0 program management?"],
        [
            "What is process for identifying risks that can impact the desired outcomes of a project?"
        ],
        ["What is the release management process?"],
    ],
)

if __name__ == "__main__":
    iface.launch()
|