# pdf-ingestor / app.py
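"""Chainlit app: ingest an uploaded PDF into a Chroma vector store with OpenAI
embeddings, then answer questions about it via a RetrievalQAWithSourcesChain,
citing the matching chunks as sources."""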
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
import os
import chainlit as cl
import tempfile
from dotenv import load_dotenv
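# load_dotenv() pulls environment variables from a local .env file if present,
# notably OPENAI_API_KEY, which OpenAIEmbeddings and ChatOpenAI read.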
load_dotenv()
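# Split text into ~1000-character chunks with 100 characters of overlap so
# context is preserved across chunk boundaries.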
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# The chain's output parser looks for a "SOURCES" section in the reply, so the
# prompt must explicitly ask the model to produce one.
system_template = """Use the following pieces of context to give a detailed answer to the user's question.
If you don't know the answer, just say that you don't know; do not make one up.
ALWAYS return a "SOURCES" part in your answer listing the source ids you used.
Begin!
----------------
{summaries}"""
messages = [
SystemMessagePromptTemplate.from_template(system_template),
HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)
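# This prompt is wired into the chain below via chain_type_kwargs.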
@cl.on_chat_start
async def init():
files = None
# Wait for the user to upload a file
while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a file to start chatting!",
            accept=["application/pdf"],  # accept expects MIME types, not extensions
        ).send()
file = files[0]
msg = cl.Message(content=f"Processing `{file.name}`...")
await msg.send()
with tempfile.NamedTemporaryFile(delete=False) as temp:
temp.write(file.content)
temp_path = temp.name
    # Load the PDF with PyPDFLoader; each returned document holds one page's
    # content plus page-number metadata.
loader = PyPDFLoader(temp_path)
pages = loader.load_and_split()
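    # Assumption: load_and_split() reads the file eagerly, so the temp file can
    # be removed now instead of lingering on disk.
    os.remove(temp_path)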
# Combine the page content into a single text variable.
text = ' '.join([page.page_content for page in pages])
# Split the text into chunks
texts = text_splitter.split_text(text)
    # Give each chunk a source id; the model echoes these ids back in its
    # "SOURCES" line, and process_response below matches them to the chunk text.
    metadatas = [{"source": f"{i}-word"} for i in range(len(texts))]
# Create a Chroma vector store
embeddings = OpenAIEmbeddings()
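    # cl.make_async wraps the blocking Chroma build so it runs in a worker
    # thread instead of stalling the event loop.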
docsearch = await cl.make_async(Chroma.from_texts)(
texts, embeddings, metadatas=metadatas
)
    # Create a chain that uses the Chroma vector store and the custom prompt
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        chain_type_kwargs={"prompt": prompt},
    )
# Save the metadata and texts in the user session
cl.user_session.set("metadatas", metadatas)
cl.user_session.set("texts", texts)
# Let the user know that the system is ready
msg.content = f"`{file.name}` processed. You can now ask questions!"
await msg.update()
cl.user_session.set("chain", chain)
@cl.on_message
async def process_response(message: cl.Message):
chain = cl.user_session.get("chain")
if chain is None:
await cl.Message(content="The system is not initialized. Please upload a PDF file first.").send()
return
# Use the chain to process the user's question
response = await chain.acall({
"question": message.content
})
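    # The chain returns the generated answer plus the "SOURCES" ids it parsed
    # out of the model's reply.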
answer = response["answer"]
sources = response["sources"].strip()
source_elements = []
# Get the metadata and texts from the user session
metadatas = cl.user_session.get("metadatas")
all_sources = [m["source"] for m in metadatas]
texts = cl.user_session.get("texts")
if sources:
found_sources = []
# Add the sources to the message
        for source in sources.split(","):
            # Trim whitespace and drop any stray periods the model appended
            source_name = source.strip().replace(".", "")
# Get the index of the source
try:
index = all_sources.index(source_name)
except ValueError:
continue
text = texts[index]
found_sources.append(source_name)
# Create the text element referenced in the message
source_elements.append(cl.Text(content=text, name=source_name))
if found_sources:
answer += f"\nSources: {', '.join(found_sources)}"
else:
answer += "\nNo sources found"
await cl.Message(content=answer, elements=source_elements).send()
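# Launch with: chainlit run app.py (add -w to auto-reload during development)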