Spaces:

shoshana-levitt
/

pdf-ingestor

Sleeping

App Files Files Community

shoshana-levitt committed 10 days ago

Commit

09315ae

•

1 Parent(s): b806895

first commit

Browse files

Files changed (5) hide show

.gitignore +1 -0
Dockerfile +7 -0
app.py +132 -0
chainlit.md +1 -0
requirements.txt +2 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

Dockerfile ADDED Viewed

	@@ -0,0 +1,7 @@

+# Image for the Chainlit PDF question-answering app in app.py.
+FROM python:3.9
+# Dedicated non-root account with uid 1000.
+RUN useradd -m -u 1000 user
+WORKDIR /app
+# WORKDIR creates /app as root; hand it to the runtime user so the app can
+# write files next to its sources.
+RUN chown user /app
+# Install dependencies before copying the sources so this layer is cached
+# across code-only changes. The chainlit package must be listed in
+# requirements.txt for the CMD below to resolve.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# Drop privileges: the account created above was previously never used.
+USER user
+# app.py registers Chainlit handlers and exposes no ASGI object named `app`,
+# so it has to be launched through the Chainlit CLI rather than `uvicorn app:app`.
+CMD ["chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,132 @@

+from langchain.document_loaders import PyPDFLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.chains import RetrievalQAWithSourcesChain
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+)
+import os
+import chainlit as cl
+import tempfile
+from dotenv import load_dotenv
+load_dotenv()
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+system_template = """ Try to find detailed information
+Begin!
+----------------
+{summaries}"""
+messages = [
+    SystemMessagePromptTemplate.from_template(system_template),
+    HumanMessagePromptTemplate.from_template("{question}"),
+]
+prompt = ChatPromptTemplate.from_messages(messages)
+@cl.on_chat_start
+async def init():
+    files = None
+    # Wait for the user to upload a file
+    while files is None:
+        files = await cl.AskFileMessage(
+            content="Please upload a file to start chatting!", accept=["pdf"]
+        ).send()
+    file = files[0]
+    msg = cl.Message(content=f"Processing `{file.name}`...")
+    await msg.send()
+    with tempfile.NamedTemporaryFile(delete=False) as temp:
+        temp.write(file.content)
+        temp_path = temp.name
+    # Load the PDF using PyPDFLoader into an array of documents, where each document contains the page content and metadata with page number.
+    loader = PyPDFLoader(temp_path)
+    pages = loader.load_and_split()
+    # Combine the page content into a single text variable.
+    text = ' '.join([page.page_content for page in pages])
+    # Split the text into chunks
+    texts = text_splitter.split_text(text)
+    # Create a metadata for each chunk
+    metadatas = [{"source": f"{i}-word"} for i in range(len(texts))]
+    # Create a Chroma vector store
+    embeddings = OpenAIEmbeddings()
+    docsearch = await cl.make_async(Chroma.from_texts)(
+        texts, embeddings, metadatas=metadatas
+    )
+    # Create a chain that uses the Chroma vector store
+    chain = RetrievalQAWithSourcesChain.from_chain_type(
+        ChatOpenAI(temperature=0),
+        chain_type="stuff",
+        retriever=docsearch.as_retriever(),
+    )
+    # Save the metadata and texts in the user session
+    cl.user_session.set("metadatas", metadatas)
+    cl.user_session.set("texts", texts)
+    # Let the user know that the system is ready
+    msg.content = f"`{file.name}` processed. You can now ask questions!"
+    await msg.update()
+    cl.user_session.set("chain", chain)
+@cl.on_message
+async def process_response(message):
+    chain = cl.user_session.get("chain")
+    if chain is None:
+        await cl.Message(content="The system is not initialized. Please upload a PDF file first.").send()
+        return
+    # Use the chain to process the user's question
+    response = await chain.acall({
+        "question": message.content
+    })
+    answer = response["answer"]
+    sources = response["sources"].strip()
+    source_elements = []
+    # Get the metadata and texts from the user session
+    metadatas = cl.user_session.get("metadatas")
+    all_sources = [m["source"] for m in metadatas]
+    texts = cl.user_session.get("texts")
+    if sources:
+        found_sources = []
+        # Add the sources to the message
+        for source in sources.split(","):
+            source_name = source.strip().replace(".", "")
+            # Get the index of the source
+            try:
+                index = all_sources.index(source_name)
+            except ValueError:
+                continue
+            text = texts[index]
+            found_sources.append(source_name)
+            # Create the text element referenced in the message
+            source_elements.append(cl.Text(content=text, name=source_name))
+        if found_sources:
+            answer += f"\nSources: {', '.join(found_sources)}"
+        else:
+            answer += "\nNo sources found"
+    await cl.Message(content=answer, elements=source_elements).send()

chainlit.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Chatbot

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ fastapi
2	+ uvicorn[standard]