import PyPDF2
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
import chainlit as cl
from chainlit.input_widget import Select
import os
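
# Chainlit RAG app: upload a PDF, index it into a persistent Chroma store with
# Ollama embeddings, then answer questions with a Groq-hosted model through a
# ConversationalRetrievalChain. Assuming this file is saved as app.py, it can
# be launched locally with:
#
#   chainlit run app.py -w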

@cl.cache
def get_memory():
    # Note: cl.cache caches process-wide, so this memory is shared across
    # sessions; fine for a single-user demo, not for multi-user deployments.
    message_history = ChatMessageHistory()
    # Buffer memory so the chain can draw on prior conversation turns
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )
    return memory

@cl.on_chat_start
async def on_chat_start():
    user_env = cl.user_session.get("env")
    os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")

    settings = await cl.ChatSettings(
        [
            Select(
                id="Model",
                label="Open Source Model",
                values=["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it"],
                initial_index=0,
            )
        ]
    ).send()

    # Wait for the user to upload a PDF file
    files = None
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a PDF file to begin!",
            accept=["application/pdf"],
            max_size_mb=100,
            timeout=180,
        ).send()
    file = files[0]  # Get the first uploaded file

    # Inform the user that processing has started
    msg = cl.Message(content=f"Processing `{file.name}`...")
    await msg.send()

    # Read the PDF file; extract_text() can return None on image-only pages
    pdf = PyPDF2.PdfReader(file.path)
    pdf_text = ""
    for page in pdf.pages:
        pdf_text += page.extract_text() or ""

    # Split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(pdf_text)

    # Create metadata for each chunk
    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]

    # Create a persistent Chroma vector store with Ollama embeddings
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    # embeddings = OllamaEmbeddings(model="llama2:7b")
    docsearch = await cl.make_async(Chroma.from_texts)(
        texts, embeddings, metadatas=metadatas, persist_directory="./chroma_db"
    )
    docsearch.persist()

    # Let the user know that the system is ready
    msg.content = f"Processing `{file.name}` done. You can now ask questions!"
    await msg.update()

    await setup_agent(settings)

@cl.on_settings_update
async def setup_agent(settings):
    print("Setup agent with settings:", settings)
    user_env = cl.user_session.get("env")
    os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")

    # Reopen the persisted Chroma store with the same embedding model
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    memory = get_memory()
    docsearch = await cl.make_async(Chroma)(
        persist_directory="./chroma_db",
        embedding_function=embeddings,
    )

    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        llm=ChatGroq(model=settings["Model"]),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Store the chain in the user session
    cl.user_session.set("chain", chain)

@cl.on_message
async def main(message: cl.Message):
    # Retrieve the chain from the user session
    chain = cl.user_session.get("chain")
    user_env = cl.user_session.get("env")
    os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")

    # Async callback handler streams intermediate steps to the UI; callbacks
    # must be passed through the run config for ainvoke to pick them up
    cb = cl.AsyncLangchainCallbackHandler()

    # Call the chain with the user's message content
    res = await chain.ainvoke(message.content, config={"callbacks": [cb]})
    answer = res["answer"]
    source_documents = res["source_documents"]

    text_elements = []  # Text elements for the retrieved source chunks

    # Process source documents if available
    if source_documents:
        for source_idx, source_doc in enumerate(source_documents):
            source_name = f"source_{source_idx}"
            # Create the text element referenced in the message
            text_elements.append(
                cl.Text(content=source_doc.page_content, name=source_name)
            )
        source_names = [text_el.name for text_el in text_elements]

        # Add source references to the answer
        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"

    # Return the answer together with its sources
    await cl.Message(content=answer, elements=text_elements).send()
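
# Rough dependency sketch, inferred from the imports above (versions are
# assumptions, not pinned by this file):
#   pip install chainlit langchain langchain-community langchain-groq PyPDF2 chromadb
# The embeddings calls expect a local Ollama server with the embedding model
# already pulled, e.g.:
#   ollama pull nomic-embed-text
# GROQ_API_KEY is read from the Chainlit user env at chat start.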