Spaces:

snrspeaks
/

ChatPDF-Chainlit

Running

App Files Files Community

Shivanand Roy committed on Jun 30, 2023

Commit

bc69d30

•

1 Parent(s): f98d774

Added application file

Browse files

Files changed (9) hide show

Dockerfile +11 -0
app.py +145 -0
assets/ChatPDF.jpg +0 -0
assets/ChatPDFAvatar.jpg +0 -0
assets/ChatPDFAvatar.png +0 -0
assets/ChatPDFLogo.png +0 -0
assets/ChatPDFLogoV2.png +0 -0
assets/avatar.jpg +0 -0
chainlit.md +19 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+# Container image for running the Chainlit app on port 7860.
+FROM python:3.9
+# Create and switch to an unprivileged user (UID 1000).
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install dependencies before copying the rest of the source so this layer
+# stays cached when only application code changes. Docker does not expand "~"
+# in COPY destinations, so the path is spelled with $HOME.
+COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
+RUN pip install -r requirements.txt
+# Copy the application source once, owned by the runtime user.
+COPY --chown=user . $HOME/app
+CMD ["chainlit", "run", "app.py", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,145 @@

+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.chains import RetrievalQAWithSourcesChain
+from langchain.memory import ConversationBufferWindowMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+)
+from langchain.document_loaders import PyPDFLoader
+import os
+import chainlit as cl
+from langchain.prompts import PromptTemplate
+# Splitter configured for 1000-character chunks with a 100-character overlap.
+# NOTE(review): nothing in the visible code passes this splitter to the PDF
+# loader - confirm whether it was meant to be used.
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+# System prompt: answer from the supplied context only, admit when the answer
+# is unknown, and always append a "SOURCES" section. The {summaries}
+# placeholder is presumably where the retrieved context is substituted.
+system_template = """Use the following pieces of context to answer the users question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+ALWAYS return a "SOURCES" part in your answer.
+The "SOURCES" part should be a reference to the source of the document from which you got your answer.
+Example of your response should be:
+```
+The answer is foo
+SOURCES: xyz
+```
+Begin!
+----------------
+{summaries}"""
+# Chat prompt: the system instructions followed by the user's {question}.
+messages = [
+    SystemMessagePromptTemplate.from_template(system_template),
+    HumanMessagePromptTemplate.from_template("{question}"),
+]
+prompt = ChatPromptTemplate.from_messages(messages)
+# NOTE(review): chain_type_kwargs is not referenced where the chain is built
+# in the visible code - confirm whether this prompt is actually applied.
+chain_type_kwargs = {"prompt": prompt}
+@cl.on_chat_start
+async def start():
+    await cl.Avatar(
+        name="ChatPDF",
+        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
+        # path = r'assets/ChatPDFAvatar.jpg'
+    ).send()
+@cl.langchain_factory(use_async=True)
+async def init():
+    files = None
+    # Wait for the user to upload a file
+    while files == None:
+        files = await cl.AskFileMessage(
+            content="Hey, Welcome to ChatPDF!\n\nChatPDF is a smart, user-friendly tool that integrates state-of-the-art AI models with text extraction and embedding capabilities to create a unique, conversational interaction with your PDF documents.\n\nSimply upload your PDF, ask your questions, and ChatPDF will deliver the most relevant answers directly from your document.\n\nPlease upload a PDF file to begin!", accept=["application/pdf"]
+        ).send()
+    file = files[0]
+    msg = cl.Message(content=f'''Processing "{file.name}"...''')
+    await msg.send()
+    #
+    with open(os.path.join(file.name), "wb") as f:
+        f.write(file.content)
+    print(file.name)
+    loader = PyPDFLoader(file.name)
+    pages = loader.load_and_split()
+    # add page split info
+    # Initialize a dictionary to keep track of duplicate page numbers
+    page_counts = {}
+    for document in pages:
+        page_number = document.metadata['page']
+        # If this is the first occurrence of this page number, initialize its count to 1
+        # Otherwise, increment the count for this page number
+        page_counts[page_number] = page_counts.get(page_number, 0) + 1
+        # Create the page split info string
+        page_split_info = f"Page-{page_number+1}.{page_counts[page_number]}"
+        # Add the page split info to the document's metadata
+        document.metadata['page_split_info'] = page_split_info
+    # Create a Chroma vector store
+    embeddings = OpenAIEmbeddings()
+    docsearch = await cl.make_async(Chroma.from_documents)(
+        pages, embeddings
+    )
+    # define memory
+    memory = ConversationBufferWindowMemory(
+        k=5,
+        memory_key='chat_history',
+        return_messages=True,
+        output_key='answer'
+        )
+    # Create a chain that uses the Chroma vector store
+    chain = ConversationalRetrievalChain.from_llm(
+        ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k", streaming=True),
+        chain_type="stuff",
+        retriever=docsearch.as_retriever(search_kwargs={'k':5}),
+        memory=memory,
+        return_source_documents=True,
+    )
+    # Save the metadata and texts in the user session
+    # cl.user_session.set("metadatas", metadatas)
+    cl.user_session.set("texts", pages)
+    # Let the user know that the system is ready
+    await msg.update(content=f''' "{file.name}" processed. You can now ask questions!''')
+    return chain
+@cl.langchain_postprocess
+async def process_response(res):
+    answer = res["answer"]
+    source_documents = res['source_documents']
+    content = [source_documents[i].page_content for i in range(len(source_documents))]
+    name = [source_documents[i].metadata['page_split_info'] for i in range(len(source_documents))]
+    source_elements = [
+        cl.Text(content=content[i], name=name[i]) for i in range(len(source_documents))
+        ]
+    if source_documents:
+        answer += f"\n\nSources: {', '.join([source_documents[i].metadata['page_split_info'] for i in range(len(source_documents))])}"
+    else:
+        answer += "\n\nNo sources found"
+    await cl.Message(content=answer, elements=source_elements).send()
+    # await cl.Message(content=answer).send()

assets/ChatPDF.jpg ADDED Viewed

assets/ChatPDFAvatar.jpg ADDED Viewed

assets/ChatPDFAvatar.png ADDED Viewed

assets/ChatPDFLogo.png ADDED Viewed

assets/ChatPDFLogoV2.png ADDED Viewed

assets/avatar.jpg ADDED Viewed

chainlit.md ADDED Viewed

	@@ -0,0 +1,19 @@

+# ChatPDF - Conversation Style Question Answering with PDFs
+ChatPDF is an application designed to provide super fast, conversation-style question answering from any PDF document.
+![](assets/ChatPDF.jpg)
+ChatPDF is a smart, user-friendly tool that integrates state-of-the-art AI models with text extraction and embedding capabilities to create a unique, conversational interaction with your PDF documents.
+Simply upload your PDF, ask your questions, and ChatPDF will deliver the most relevant answers directly from your document.
+## Features
+1. PDF text extraction: ChatPDF accepts PDF files, from which it extracts the text in an intelligent manner.
+2. Document chunking: The extracted text is split into manageable chunks, enabling efficient processing.
+3. Text embedding: The chunks are embedded using sophisticated natural language processing techniques and stored in a vector database.
+4. Fast querying: Upon user query, the application swiftly identifies the most relevant document chunks from the vector database.
+5. AI-powered answers: The selected chunks are then passed to a large language model (LLM), which generates detailed, coherent responses.