Spaces:

shoshana-levitt
/

pdf-ingestor

Sleeping

App Files Files Community

pdf-ingestor / app.py

shoshana-levitt

edit Dockerfile

92a4682 4 months ago

raw

history blame

No virus

4.19 kB

	from fastapi import FastAPI
	from langchain_community.document_loaders import PyPDFLoader
	from langchain_community.embeddings import OpenAIEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain.chains import RetrievalQAWithSourcesChain
	from langchain_community.chat_models import ChatOpenAI
	from langchain.prompts.chat import (
	ChatPromptTemplate,
	SystemMessagePromptTemplate,
	HumanMessagePromptTemplate,
	)
	import os
	import chainlit as cl
	import tempfile
	from dotenv import load_dotenv

	# Load variables from a local .env file into the process environment
	# (presumably the OpenAI credentials used by the clients below -- confirm).
	load_dotenv()

	# FastAPI application object. It is not referenced again in the visible code;
	# presumably the server entry point imports it -- TODO confirm.
	app = FastAPI()

	# Shared splitter: chunks of up to 1000 characters with a 100-character overlap.
	text_splitter = RecursiveCharacterTextSplitter(
	    chunk_size=1000,
	    chunk_overlap=100,
	)

	# System prompt text; `{summaries}` is the only placeholder it declares.
	system_template = """ Try to find detailed information

	Begin!
	----------------
	{summaries}"""

	# Chat prompt = system instructions followed by the user's `{question}`.
	# NOTE(review): `prompt` is built here but nothing in the visible code passes
	# it to a chain -- confirm whether it is meant to be used.
	messages = [
	    SystemMessagePromptTemplate.from_template(system_template),
	    HumanMessagePromptTemplate.from_template("{question}"),
	]

	prompt = ChatPromptTemplate.from_messages(messages)

	@cl.on_chat_start
	async def init():
	    """Ask for a PDF, index its text, and store a QA chain in the user session.

	    Steps:
	      1. Keep prompting until the user uploads a PDF.
	      2. Write the upload to a temporary file, load it with ``PyPDFLoader``
	         and delete the temporary file again.
	      3. Join the page texts, re-split them with the module-level
	         ``text_splitter`` and label chunk ``i`` with the source ``"{i}-word"``.
	      4. Embed the chunks into a Chroma store and build a
	         ``RetrievalQAWithSourcesChain`` on top of its retriever.

	    On success the user session holds ``"metadatas"``, ``"texts"`` and
	    ``"chain"``. If no text can be extracted, the user is told so and no
	    chain is stored, so the message handler reports the session as
	    uninitialized.

	    NOTE(review): the module-level ``prompt`` is not passed to the chain
	    here -- confirm whether that is intentional.
	    """
	    import uuid  # local import: only needed for the per-upload collection name

	    files = None

	    # Re-ask for as long as send() yields None (presumably a timeout).
	    while files is None:
	        files = await cl.AskFileMessage(
	            content="Please upload a file to start chatting!", accept=["pdf"]
	        ).send()

	    file = files[0]

	    msg = cl.Message(content=f"Processing `{file.name}`...")
	    await msg.send()

	    # PyPDFLoader takes a path, so the upload is spooled to disk first.
	    # delete=False lets the loader reopen the file after the handle is closed;
	    # the finally clause removes it so uploads do not pile up in the temp dir.
	    temp = tempfile.NamedTemporaryFile(delete=False)
	    try:
	        with temp:
	            temp.write(file.content)
	        loader = PyPDFLoader(temp.name)
	        # Parsing is synchronous; run it off the event loop like the Chroma
	        # build below.
	        pages = await cl.make_async(loader.load_and_split)()
	    finally:
	        os.remove(temp.name)

	    # Combine the page contents into one string, then split it into chunks.
	    text = " ".join(page.page_content for page in pages)
	    texts = text_splitter.split_text(text)

	    # A PDF without extractable text (e.g. scanned images) yields no chunks;
	    # stop here instead of handing an empty list to the vector store.
	    if not texts:
	        msg.content = (
	            f"No text could be extracted from `{file.name}`. "
	            "Please start a new chat and upload a different PDF."
	        )
	        await msg.update()
	        return

	    # One metadata record per chunk; the label is what the chain cites.
	    metadatas = [{"source": f"{i}-word"} for i in range(len(texts))]

	    # Build the vector store in a worker thread. A unique collection name
	    # keeps this upload's chunks apart from any other upload handled by the
	    # same process (the default name would be shared between them).
	    embeddings = OpenAIEmbeddings()
	    docsearch = await cl.make_async(Chroma.from_texts)(
	        texts,
	        embeddings,
	        metadatas=metadatas,
	        collection_name=f"pdf-{uuid.uuid4().hex}",
	    )

	    # Create a chain that answers from the retrieved chunks and reports sources.
	    chain = RetrievalQAWithSourcesChain.from_chain_type(
	        ChatOpenAI(temperature=0),
	        chain_type="stuff",
	        retriever=docsearch.as_retriever(),
	    )

	    # Store everything the message handler needs before announcing readiness,
	    # so a question sent right after the notice already finds the chain.
	    cl.user_session.set("metadatas", metadatas)
	    cl.user_session.set("texts", texts)
	    cl.user_session.set("chain", chain)

	    # Let the user know that the system is ready.
	    msg.content = f"`{file.name}` processed. You can now ask questions!"
	    await msg.update()

	@cl.on_message
	async def process_response(message):
	    """Answer one chat message using the chain stored in the user session.

	    If no chain is stored, the user is asked to upload a PDF first.
	    Otherwise the message content is sent to the chain as ``"question"``.
	    Each comma-separated entry of the returned ``"sources"`` string that
	    matches a stored chunk label is attached to the reply as a ``cl.Text``
	    element and listed after the answer; unknown labels are skipped.
	    """
	    qa_chain = cl.user_session.get("chain")

	    if qa_chain is None:
	        notice = cl.Message(
	            content="The system is not initialized. Please upload a PDF file first."
	        )
	        await notice.send()
	        return

	    # Run the retrieval chain on the user's question.
	    result = await qa_chain.acall({"question": message.content})

	    answer = result["answer"]
	    cited = result["sources"].strip()
	    source_elements = []

	    # Map every chunk label to the position of its first occurrence.
	    stored_metadatas = cl.user_session.get("metadatas")
	    position_of = {}
	    for position, metadata in enumerate(stored_metadatas):
	        position_of.setdefault(metadata["source"], position)
	    chunks = cl.user_session.get("texts")

	    if cited:
	        matched = []

	        for entry in cited.split(","):
	            # Normalize the label: trim whitespace and drop any periods.
	            label = entry.strip().replace(".", "")
	            position = position_of.get(label)
	            if position is None:
	                # The chain cited something that is not a stored chunk label.
	                continue
	            chunk = chunks[position]
	            matched.append(label)
	            # Text element the reply refers to by name.
	            source_elements.append(cl.Text(content=chunk, name=label))

	        if matched:
	            answer += f"\nSources: {', '.join(matched)}"
	        else:
	            answer += "\nNo sources found"

	    reply = cl.Message(content=answer, elements=source_elements)
	    await reply.send()