|
import streamlit as st |
|
from dotenv import load_dotenv |
|
from PyPDF2 import PdfReader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.vectorstores import FAISS |
|
|
|
from langchain.memory import ConversationBufferMemory |
|
from langchain.chains import ConversationalRetrievalChain |
|
from htmlTemplates import css, bot_template, user_template |
|
from langchain.llms import HuggingFaceHub |
|
import os |
|
|
|
|
|
# Load variables from a local .env file *before* reading the token, so the
# script works when the token lives only in .env (previously os.environ[...]
# raised KeyError at import time, before load_dotenv() in main() could run).
load_dotenv()
hub_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
if hub_token is None:
    raise RuntimeError(
        "HUGGINGFACE_HUB_TOKEN is not set; export it or add it to a .env file."
    )
|
|
|
def split_pdfs(pdf_docs):
    """Split the pages of several PDF documents into chunks of at most 10 pages.

    Args:
        pdf_docs: A list of PDF documents — file paths or file-like objects,
            anything accepted by ``PyPDF2.PdfReader``.

    Returns:
        A list of lists of PDF page objects. Pages appear in document order
        and every sublist holds at most 10 pages. An empty input yields [].
    """
    pdf_chunks = [[]]
    # BUG FIX: the original body read an undefined name `pdf_doc` and never
    # used the `pdf_docs` parameter; iterate over every supplied document.
    for pdf_doc in pdf_docs:
        pdf_reader = PdfReader(pdf_doc)
        for pdf_page in pdf_reader.pages:
            pdf_chunks[-1].append(pdf_page)
            # Start a fresh chunk once the current one is full.
            if len(pdf_chunks[-1]) >= 10:
                pdf_chunks.append([])
    # BUG FIX: drop the trailing empty chunk left behind when the total page
    # count is an exact multiple of 10 (or when no pages were read at all).
    if not pdf_chunks[-1]:
        pdf_chunks.pop()
    return pdf_chunks
|
|
|
def generate_response(pdf_chunks, llm_model): |
|
"""Generates a response to a query using a list of PDF documents and an LLM model. |
|
|
|
Args: |
|
pdf_chunks: A list of lists of PDF documents, where each sublist contains a smaller chunk of the original PDF documents. |
|
llm_model: An LLM model. |
|
|
|
Returns: |
|
A response to the query. |
|
""" |
|
|
|
|
|
pdf_summaries = [] |
|
for pdf_chunk in pdf_chunks: |
|
|
|
pdf_summary = llm_model.generate( |
|
prompt=f"Summarize the following text:\n{get_pdf_text(pdf_chunk)}", |
|
max_new_tokens=100 |
|
) |
|
|
|
|
|
pdf_summaries.append(pdf_summary) |
|
|
|
|
|
response = llm_model.generate( |
|
prompt=f"Answer the following question using the following summaries:\n{get_text_chunks(pdf_summaries)}\n\nQuestion:", |
|
max_new_tokens=200 |
|
) |
|
|
|
return response |
|
|
|
def main():
    """Streamlit entry point: render the chat page and answer PDF questions."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Model and token are hard-coded / module-level; `hub_token` is read once
    # at import time from HUGGINGFACE_HUB_TOKEN.
    llm_model = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.1",
        huggingfacehub_api_token=hub_token,
        verbose=True,
    )

    # Initialize the session-state slots used by the conversational UI.
    # NOTE(review): neither slot is read or written anywhere else in this
    # file — presumably intended for ConversationalRetrievalChain; confirm.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if user_question:
        # BUG FIX: split_pdfs documents its argument as "a list of PDF
        # documents", but the original passed a bare path string.
        pdf_chunks = split_pdfs(["Geeta.pdf"])
        # NOTE(review): user_question is never forwarded to generate_response,
        # so the final prompt's "Question:" section is empty — confirm intent.
        response = generate_response(pdf_chunks, llm_model)
        st.write(response)


if __name__ == "__main__":
    # BUG FIX: guard the entry point so importing this module (e.g. from a
    # test) does not immediately launch the Streamlit app.
    main()
|
|