# CSAle's picture
# Adding Initial App
# f25b2b3
# raw
# history blame
# 3.99 kB
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
import os
import arxiv
import chainlit as cl
from chainlit import user_session
# Value stored under "env" in the Chainlit user session; the chain factory
# reads OPENAI_API_KEY from it.
# NOTE(review): this lookup runs at import time — confirm a user session is
# actually available at that point.
user_env = user_session.get("env")

# System prompt: instructs the model to answer from the retrieved context only
# and to always append a "SOURCES" section. `{summaries}` is the placeholder
# for the retrieved document text.
system_template = """Use the following pieces of context to answer the users question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
Example of your response should be:
```
The answer is foo
SOURCES:
Title: xyz
Page Number: 1
URL: https://arxiv.org/abs/X.Y.Z
```
Begin!
----------------
{summaries}"""

# Chat prompt = system instructions followed by the user's question.
_system_message = SystemMessagePromptTemplate.from_template(system_template)
_human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [_system_message, _human_message]
prompt = ChatPromptTemplate.from_messages(messages)

# Keyword arguments intended for the underlying combine-documents chain.
chain_type_kwargs = dict(prompt=prompt)
@cl.langchain_factory
def init():
    """Build the retrieval QA chain for a user-chosen arXiv topic.

    Asks the user for a topic, searches arXiv for up to 30 relevant results,
    loads every result's PDF into documents tagged with the paper's entry id,
    PDF URL and title, indexes them in a Chroma vector store, and wraps that
    store in a ``RetrievalQAWithSourcesChain`` driven by GPT-4.

    Returns:
        The configured chain. ``return_source_documents=True`` is set so the
        post-processing step can list where each answer came from.
    """
    arxiv_query = None

    # Keep asking until the user answers (presumably `send()` yields None when
    # the 15 second timeout expires — hence the loop).
    while arxiv_query is None:
        arxiv_query = cl.AskUserMessage(
            content="Please enter a topic to begin!", timeout=15
        ).send()
    topic = arxiv_query["content"]

    # Obtain the top 30 results from Arxiv for the query
    search = arxiv.Search(
        query=topic,
        max_results=30,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    # Download each of the PDFs and tag every loaded document with the paper
    # it belongs to, so answers can cite title / URL later.
    pdf_data = []
    for result in search.results():
        loader = PyMuPDFLoader(result.pdf_url)
        for document in loader.load():
            document.metadata["source"] = result.entry_id
            document.metadata["file_path"] = result.pdf_url
            document.metadata["title"] = result.title
            pdf_data.append(document)

    # Create a Chroma vector store
    # NOTE(review): the embeddings client is not given the user's API key, so
    # it relies on its own default key lookup — confirm this is intended.
    embeddings = OpenAIEmbeddings(disallowed_special=())
    docsearch = Chroma.from_documents(pdf_data, embeddings)

    # Resolve the user's environment here rather than relying solely on the
    # import-time lookup, which may have run before any session existed.
    # Falling back to an empty mapping avoids an AttributeError on None; a
    # missing key is then passed as None (presumably letting the client fall
    # back to its default key lookup — TODO confirm).
    session_env = user_session.get("env") or user_env or {}

    # Create a chain that uses the Chroma vector store
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(
            model_name="gpt-4",
            temperature=0,
            openai_api_key=session_env.get("OPENAI_API_KEY"),
        ),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        # Pass the custom prompt; without this the module-level SOURCES
        # prompt was built but never used by the chain.
        chain_type_kwargs=chain_type_kwargs,
        return_source_documents=True,
    )

    # Let the user know that the system is ready
    cl.Message(
        content=f"We found a few papers about `{topic}` you can now ask questions!"
    ).send()

    return chain
@cl.langchain_postprocess
def process_response(res):
    """Send the chain's answer with one inline source element per paper.

    Reads ``res["answer"]`` and ``res["source_documents"]``, groups the source
    documents by their ``title`` metadata, and attaches to the outgoing
    message a ``cl.Text`` element per title that lists the sorted page numbers
    and the URL taken from the first document seen for that title.
    """
    answer = res["answer"]

    # title -> {"page_number": [...], "url": ...}, in first-seen order
    papers = {}
    for doc in res["source_documents"]:
        meta = doc.metadata
        title = meta["title"]
        entry = papers.get(title)
        if entry is None:
            papers[title] = {
                "page_number": [meta["page"]],
                "url": meta["file_path"],
            }
            continue
        entry["page_number"].append(meta["page"])

    source_elements = []
    for title, info in papers.items():
        # Render the pages in ascending order as a comma-separated list
        pages = ", ".join(map(str, sorted(info["page_number"])))
        body = f"Page Number(s): {pages}\nURL: {info['url']}"
        source_elements.append(cl.Text(name=title, text=body, display="inline"))

    cl.Message(content=answer, elements=source_elements).send()