# DocuChat_2 / DocuChat.py
# mckplus's picture
# Update DocuChat.py
# 23237f8
# raw
# history blame
# No virus
# 3.81 kB
import os
import re
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI as LangchainOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import panel as pn
# Load the Lato web font.  ``raw_css`` entries are treated as CSS text, so an
# HTML ``<link>`` tag placed there is not a valid stylesheet rule; the font
# stylesheet is registered by URL through ``css_files`` instead.
lato_font_url = "https://fonts.googleapis.com/css2?family=Lato:wght@400;700&display=swap"
# Same value as before; kept so the module-level name stays available.
lato_font_link = f"<link href='{lato_font_url}' rel='stylesheet'>"
pn.config.css_files.append(lato_font_url)
# Custom CSS to use Lato font
pn.config.raw_css.append("""
.bk, .bk-root, .bk-widget {
font-family: 'Lato', sans-serif !important;
}
.mckenzie-link a {
font-weight: bold;
color: #1b9aaa;
}
""")
# Set global sizing mode
pn.config.sizing_mode = 'stretch_width'
# Panel extension
pn.extension()
class LangchainConversation:
    """Panel chat UI that answers questions about an uploaded PDF.

    The user uploads a PDF, supplies an OpenAI API key and types questions
    into a chat box; each question is answered by a LangChain ``RetrievalQA``
    chain built over the document.
    """

    # Where the uploaded PDF is written before being loaded for analysis.
    _PDF_PATH = "/.cache/temp.pdf"

    def __init__(self):
        """Create the widgets and subscribe to chat box updates."""
        self.file_input = pn.widgets.FileInput(height=45)
        self.openaikey = pn.widgets.PasswordInput(value="", placeholder="Enter your OpenAI API Key here...", height=45)
        self.chatbox = pn.widgets.ChatBox(height=300, primary_name="User")
        self.chatbox.param.watch(self._chat, 'value')
        self.chat_history = []  # (query, answer) tuples from earlier exchanges

    def _chat(self, event):
        """Answer the newest user message in the chat box.

        Args:
            event: Watcher event whose ``new`` attribute is the list of chat
                messages; only the last one is inspected.
        """
        # The chat value can be empty, in which case there is nothing to
        # answer and indexing the last message would raise.
        if not event.new:
            return
        question = event.new[-1].get("User")
        if question is None:
            # Not a user message (this callback also fires for the "AI"
            # replies appended below).
            return
        os.environ["OPENAI_API_KEY"] = self.openaikey.value
        if self.file_input.value is None:
            # No document uploaded yet: nothing to query.
            return
        self.file_input.save(self._PDF_PATH)
        prompt_text = self.remove_empty_lines(question)
        if prompt_text:
            result = self.qa(file=self._PDF_PATH, query=prompt_text)
            self.chatbox.append({"AI": result})

    @staticmethod
    def remove_empty_lines(text):
        """Return *text* with each line stripped and blank lines dropped.

        Lines are split on real line breaks and re-joined with ``"\\n"``.
        """
        stripped = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in stripped if line)

    def _with_history(self, query):
        """Return *query* followed by the earlier exchanges, if any."""
        if not self.chat_history:
            return query
        history = "\n".join(f"User: {q}\nAI: {a}" for q, a in self.chat_history)
        return f"{query}\n{history}"

    def qa(self, file, query):
        """Answer *query* against the PDF at *file* and record the exchange.

        Args:
            file: Path of the PDF to load.
            query: The user's question.

        Returns:
            The ``'result'`` entry of the chain's output.
        """
        # NOTE: the document is loaded, split and embedded again on every call.
        documents = PyPDFLoader(file).load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_documents(documents)
        embeddings = OpenAIEmbeddings()
        db = Chroma.from_documents(texts, embeddings)
        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
        chain = RetrievalQA.from_chain_type(llm=LangchainOpenAI(), chain_type="stuff", retriever=retriever, return_source_documents=True)
        # Earlier exchanges are appended to the question so the chain sees them.
        result = chain({"query": self._with_history(query)})
        answer = result['result']
        self.chat_history.append((query, answer))
        return answer

    def view(self):
        """Build the page layout, mark it servable and return it."""
        layout = pn.Column(
            pn.pane.Markdown("""
# DocuChat
AI-Powered Query Engine for Document Insights (powered by LangChain & OpenAI)
## How it works:
1) Upload a PDF
2) Enter your OpenAI API key (get one via [OpenAI](https://platform.openai.com/account))
3) Type a question and your document will get analyzed for an answer
Built by <span class="mckenzie-link">[McKenzie](https://www.mckenzielloydsmith.com/home?utm_source=HuggingFace&utm_medium=PDF+Analyzer)</span>.
"""),
            pn.Row(self.file_input, self.openaikey), self.chatbox
        ).servable()
        return layout
# Create the application instance and build its layout; view() also marks the
# layout as servable, so the call is made for that effect and its return
# value is not used here.
langchain_conversation = LangchainConversation()
langchain_conversation.view()