Shivanand Roy committed on
Commit
bc69d30
1 Parent(s): f98d774

Added application file

Browse files
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for the ChatPDF Chainlit application (listens on port 7860).
FROM python:3.9

# Create and switch to an unprivileged user (UID 1000) so the app does not run as root.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Install dependencies before copying the rest of the sources so this layer is
# only rebuilt when requirements.txt changes.
# NOTE: Docker does not expand "~" in COPY destinations (it would create a
# literal "~" directory under WORKDIR), so the path is spelled with $HOME.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application sources once, owned by the runtime user.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings.openai import OpenAIEmbeddings
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.chains import RetrievalQAWithSourcesChain
5
+ from langchain.memory import ConversationBufferWindowMemory
6
+ from langchain.chains import ConversationalRetrievalChain
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.prompts.chat import (
9
+ ChatPromptTemplate,
10
+ SystemMessagePromptTemplate,
11
+ HumanMessagePromptTemplate,
12
+ )
13
+ from langchain.document_loaders import PyPDFLoader
14
+ import os
15
+ import chainlit as cl
16
+ from langchain.prompts import PromptTemplate
17
+
# Text splitter configured with chunk_size=1000 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# System instructions: answer only from the supplied context ({summaries})
# and always finish with a "SOURCES" section.
system_template = """Use the following pieces of context to answer the users question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.

Example of your response should be:

```
The answer is foo
SOURCES: xyz
```

Begin!
----------------
{summaries}"""

# Chat prompt = system instructions followed by the user's question.
_system_prompt = SystemMessagePromptTemplate.from_template(system_template)
_human_prompt = HumanMessagePromptTemplate.from_template("{question}")
messages = [_system_prompt, _human_prompt]
prompt = ChatPromptTemplate.from_messages(messages)

# Keyword arguments carrying the prompt, for chains that accept them.
chain_type_kwargs = dict(prompt=prompt)
41
+
@cl.on_chat_start
async def start():
    """Send the "ChatPDF" avatar when a chat starts."""
    # The avatar image is fetched from a remote URL; a local file such as
    # assets/ChatPDFAvatar.jpg could presumably be supplied via `path=` instead.
    avatar = cl.Avatar(
        name="ChatPDF",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
    )
    await avatar.send()
49
+
50
+
@cl.langchain_factory(use_async=True)
async def init():
    """Ask the user for a PDF, index it, and build the question-answering chain.

    Steps:
      1. Prompt for a PDF upload until one is received.
      2. Write the upload to a private temporary directory and split it into
         documents with ``PyPDFLoader.load_and_split``.
      3. Tag every document with ``metadata['page_split_info']`` in the form
         ``Page-<page>.<n>``, where ``<page>`` is the 1-based page number and
         ``<n>`` counts the chunks seen so far for that page.
      4. Embed the documents into a Chroma vector store.
      5. Build a ``ConversationalRetrievalChain`` with windowed chat memory.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    # Local import: tempfile is only needed for staging the uploaded file.
    import tempfile

    files = None

    # Keep asking until a file is returned; send() presumably yields None
    # when the prompt expires without an upload.
    while files is None:
        files = await cl.AskFileMessage(
            content="Hey, Welcome to ChatPDF!\n\nChatPDF is a smart, user-friendly tool that integrates state-of-the-art AI models with text extraction and embedding capabilities to create a unique, conversational interaction with your PDF documents.\n\nSimply upload your PDF, ask your questions, and ChatPDF will deliver the most relevant answers directly from your document.\n\nPlease upload a PDF file to begin!",
            accept=["application/pdf"],
        ).send()

    file = files[0]

    msg = cl.Message(content=f'''Processing "{file.name}"...''')
    await msg.send()

    # Stage the upload in a throwaway directory instead of the working
    # directory: the client-supplied name is reduced to its basename so it
    # cannot escape the directory, and the file is removed once parsed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        safe_name = os.path.basename(file.name) or "upload.pdf"
        pdf_path = os.path.join(tmp_dir, safe_name)
        with open(pdf_path, "wb") as f:
            f.write(file.content)

        # load_and_split() reads the whole file eagerly, so the temporary
        # copy is no longer needed after this call.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()

    # Number the chunks of each page: page_counts maps a page number to how
    # many chunks of that page have been seen so far.
    page_counts = {}
    for document in pages:
        page_number = document.metadata['page']
        page_counts[page_number] = page_counts.get(page_number, 0) + 1

        # metadata['page'] is shifted by one for display (Page-1 is page 0).
        document.metadata['page_split_info'] = (
            f"Page-{page_number+1}.{page_counts[page_number]}"
        )

    # Create a Chroma vector store; from_documents is wrapped with
    # cl.make_async so it can be awaited here.
    embeddings = OpenAIEmbeddings()
    docsearch = await cl.make_async(Chroma.from_documents)(
        pages, embeddings
    )

    # Conversation memory (k=5); output_key tells it which chain output to
    # record, since the chain also returns source documents.
    memory = ConversationBufferWindowMemory(
        k=5,
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )

    # Create a chain that retrieves from the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k", streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(search_kwargs={'k': 5}),
        memory=memory,
        return_source_documents=True,
    )

    # Save the split documents in the user session
    cl.user_session.set("texts", pages)

    # Let the user know that the system is ready
    await msg.update(content=f''' "{file.name}" processed. You can now ask questions!''')

    return chain
127
+
128
+
@cl.langchain_postprocess
async def process_response(res):
    """Send the chain's answer together with the source chunks it used.

    Args:
        res: Chain output; read for its ``"answer"`` string and its
            ``'source_documents'`` list. Each source document must carry
            ``metadata['page_split_info']``.

    The answer is suffixed with a "Sources: ..." line listing the
    ``page_split_info`` labels (or "No sources found" when the list is
    empty), and each source chunk is attached as a ``cl.Text`` element named
    after its label.
    """
    answer = res["answer"]
    source_documents = res['source_documents']

    # Compute the labels once and reuse them for both the element names and
    # the "Sources" line.
    labels = [doc.metadata['page_split_info'] for doc in source_documents]
    source_elements = [
        cl.Text(content=doc.page_content, name=label)
        for doc, label in zip(source_documents, labels)
    ]

    if source_documents:
        answer += f"\n\nSources: {', '.join(labels)}"
    else:
        answer += "\n\nNo sources found"

    await cl.Message(content=answer, elements=source_elements).send()
assets/ChatPDF.jpg ADDED
assets/ChatPDFAvatar.jpg ADDED
assets/ChatPDFAvatar.png ADDED
assets/ChatPDFLogo.png ADDED
assets/ChatPDFLogoV2.png ADDED
assets/avatar.jpg ADDED
chainlit.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ChatPDF - Conversation Style Question Answering with PDFs
2
+
3
+ ChatPDF is an application designed to provide super fast, conversation-style question answering from any PDF document.
4
+
5
+ ![](assets/ChatPDF.jpg)
6
+
7
+ ChatPDF is a smart, user-friendly tool that integrates state-of-the-art AI models with text extraction and embedding capabilities to create a unique, conversational interaction with your PDF documents.
8
+
9
+ Simply upload your PDF, ask your questions, and ChatPDF will deliver the most relevant answers directly from your document.
10
+
11
+
12
+ ## Features
13
+
14
+ 1. PDF text extraction: ChatPDF accepts PDF files, from which it extracts the text in an intelligent manner.
15
+ 2. Document chunking: The extracted text is split into manageable chunks, enabling efficient processing.
16
+ 3. Text embedding: The chunks are embedded using sophisticated natural language processing techniques and stored in a vector database.
17
+ 4. Fast querying: Upon user query, the application swiftly identifies the most relevant document chunks from the vector database.
18
+ 5. AI-powered answers: The selected chunks are then passed to a large language model (LLM), which generates detailed, coherent responses.
19
+