Pragatheeswaran-21
first commit
12a07be
import os
import shutil
import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Together
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
# Re-export the Together API key only when it is actually present.
# os.environ accepts only str values, so assigning the None returned by
# os.getenv() for an unset variable would raise TypeError at import time.
_together_api_key = os.getenv("TOGETHER_API_KEY")
if _together_api_key is not None:
    os.environ["TOGETHER_API_KEY"] = _together_api_key
def inference(chain, input_query):
    """Run ``input_query`` through ``chain`` and return what it produces.

    Args:
        chain: Any object exposing an ``invoke(query)`` method.
        input_query: The value handed to ``chain.invoke``.

    Returns:
        The value returned by ``chain.invoke(input_query)``, unchanged.
    """
    return chain.invoke(input_query)
def create_chain(retriever, prompt, model):
    """Pipe retriever, prompt, model and a string parser into one chain.

    Args:
        retriever: Component that supplies the ``context`` prompt variable.
        prompt: Prompt template with ``context`` and ``question`` variables.
        model: Language model that receives the rendered prompt.

    Returns:
        The composed chain; its output passes through ``StrOutputParser``.
    """
    # The incoming query is fanned out: the retriever turns it into the
    # context, while RunnablePassthrough forwards it as the question.
    chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
    output_parser = StrOutputParser()
    return chain_inputs | prompt | model | output_parser
def generate_prompt():
    """Build the question-answering prompt template.

    Returns:
        A ``ChatPromptTemplate`` with ``{context}`` and ``{question}``
        placeholders, wrapped in ``<s>[INST] ... [/INST]`` markers.
    """
    return ChatPromptTemplate.from_template(
        """<s>[INST] Answer the question in a simple sentence based only on the following context:
{context}
Question: {question} [/INST]
"""
    )
def configure_model():
    """Create the Together-hosted Mixtral instruct model used for answers.

    Returns:
        A ``Together`` LLM instance configured with fixed sampling settings.
    """
    sampling_options = {
        "temperature": 0.1,
        "max_tokens": 3000,
        "top_k": 50,
        "top_p": 0.7,
        "repetition_penalty": 1.1,
    }
    return Together(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        **sampling_options,
    )
def configure_retriever(pdf_loader):
    """Index the given documents in FAISS and expose them as a retriever.

    Args:
        pdf_loader: The documents to index (``process_document`` passes the
            list returned by ``load_documents``).

    Returns:
        The retriever obtained from the FAISS vector store.
    """
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_documents(pdf_loader, embedder).as_retriever()
def load_documents(path, chunk_size=18000, chunk_overlap=10):
    """Load every PDF found directly in ``path`` and split it into chunks.

    Args:
        path: Directory to scan (non-recursively) for PDF files.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.

    Returns:
        A flat list with the split documents of all PDFs, processed in
        file-name order; an empty list when the directory holds no PDFs.
    """
    # Match the extension case-insensitively so files such as "REPORT.PDF"
    # are not silently skipped, and sort because os.listdir order is arbitrary.
    pdf_paths = [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.lower().endswith('.pdf')
    ]
    if not pdf_paths:
        return []

    # The splitter does not depend on the file, so build it once.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = []
    for filepath in pdf_paths:
        documents = UnstructuredPDFLoader(filepath).load()
        chunks.extend(text_splitter.split_documents(documents))
    return chunks
def process_document(path, input_query):
    """Answer ``input_query`` from the PDFs stored under ``path``.

    Loads and splits the documents, builds the model, prompt and retriever,
    wires them into a chain, and runs the query through it.

    Args:
        path: Directory containing the PDF files to query.
        input_query: The user's question.

    Returns:
        The chain's response to ``input_query``.
    """
    chunks = load_documents(path)
    model = configure_model()
    qa_prompt = generate_prompt()
    qa_chain = create_chain(configure_retriever(chunks), qa_prompt, model)
    return inference(qa_chain, input_query)
def main():
    """Run the Streamlit document Q&A app.

    Saves uploaded PDFs into a scratch folder, answers questions about them
    through ``process_document``, keeps the question/answer history in
    ``st.session_state.chat_history``, and offers a sidebar button that
    deletes the saved files.
    """
    tmp_folder = '/tmp/1'
    os.makedirs(tmp_folder, exist_ok=True)

    st.title("Document Q&A Chatbot")
    uploaded_files = st.sidebar.file_uploader("Choose PDF files", accept_multiple_files=True, type='pdf')

    if uploaded_files:
        for file in uploaded_files:
            # The file name comes from the client: keep only its last path
            # component so a crafted name cannot write outside tmp_folder.
            safe_name = os.path.basename(file.name.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            with open(os.path.join(tmp_folder, safe_name), 'wb') as f:
                f.write(file.getbuffer())
        st.success('File successfully uploaded. Start prompting!')

    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    if uploaded_files:
        with st.form(key='question_form'):
            user_query = st.text_input("Ask a question:", key="query_input")
            if st.form_submit_button("Ask") and user_query:
                response = process_document(tmp_folder, user_query)
                st.session_state.chat_history.append({"question": user_query, "answer": response})

        if st.button("Clear Chat History"):
            st.session_state.chat_history = []

        for chat in st.session_state.chat_history:
            st.markdown(f"**Q:** {chat['question']}")
            st.markdown(f"**A:** {chat['answer']}")
            st.markdown("---")
    else:
        st.success('Upload Document to Start Process !')

    if st.sidebar.button("REMOVE UPLOADED FILES"):
        if os.listdir(tmp_folder):
            shutil.rmtree(tmp_folder)
            # Recreate the folder so it still exists for anything that
            # touches it after the deletion.
            os.makedirs(tmp_folder, exist_ok=True)
            st.sidebar.write("FILES DELETED SUCCESSFULLY !!!")
        else:
            st.sidebar.write("NO DOCUMENT FOUND TO DELETE !!! PLEASE UPLOAD DOCUMENTS TO START PROCESS !! ")


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()