import os

import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from langchain_community.llms import CTransformers
# Translator was used in translate_text() but never imported; assuming the googletrans package.
from googletrans import Translator

# Load environment variables
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise ValueError("Google API key not found. Please check your environment variables.")
genai.configure(api_key=google_api_key)

# Download NLTK stopwords and extend them with conversational filler words
nltk.download('stopwords')
stop_words = stopwords.words('english')
custom_stopwords = ["what", "is", "how", "who", "explain", "about", "?",
                    "please", "hey", "whatsup", "can u explain"]
stop_words.extend(custom_stopwords)


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
    return text


def get_text_chunks(text):
    """Split the raw text into overlapping chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return text_splitter.split_text(text)


def get_vector_store(text_chunks):
    """Embed the chunks with Gemini embeddings and persist a FAISS index locally."""
    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        vector_store.save_local("faiss_index")
    except Exception as e:
        st.error(f"Error during embedding: {e}")


def get_conversational_chain():
    """Build a 'stuff' QA chain around Gemini with a context-grounded prompt."""
    prompt_template = """
    Please provide a detailed answer based on the provided context.
    If the necessary information to answer the question is not present in the context,
    respond with 'The answer is not available in the context'.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    return load_qa_chain(model, chain_type="stuff", prompt=prompt)


def get_llama_response(input_text, no_words, blog_style, response_language):
    """Generate a summary-style response with a local quantized LLaMA-2 model."""
    llm = CTransformers(
        model='llama-2-7b-chat.ggmlv3.q8_0.bin',
        model_type='llama',
        config={'max_new_tokens': 500, 'temperature': 0.01}
    )
    template = """
    Given some information of '{input_text}', provide a concise summary suitable for a
    {blog_style} blog post in approximately {no_words} words.
    The total response should be in {response_language} language.
    Focus on key aspects and provide accurate information.
    """
    prompt = PromptTemplate(
        input_variables=["blog_style", "input_text", "no_words", "response_language"],
        template=template
    )
    response = llm(prompt.format(input_text=input_text, no_words=no_words,
                                 blog_style=blog_style, response_language=response_language))
    return response


def calculate_cosine_similarity(text, user_question):
    """TF-IDF cosine similarity between the document text and the user's question."""
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    tfidf_matrix = vectorizer.fit_transform([text, user_question])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]


def translate_text(text, dest_language):
    """Translate text with googletrans (see the hedged import above)."""
    translator = Translator()
    translation = translator.translate(text, dest=dest_language)
    return translation.text


def user_input(user_question, raw_text, response_language):
    """Answer with Gemini first; fall back to LLaMA-2 when the context is insufficient."""
    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
        docs = new_db.similarity_search(user_question)
        gemini_chain = get_conversational_chain()
        gemini_response = gemini_chain({"input_documents": docs, "question": user_question},
                                       return_only_outputs=True)
        initial_response = gemini_response["output_text"]
    except Exception as e:
        # st.error(f"Error during question answering: {e}")
        initial_response = "The provided context does not contain any information"

    similarity_score = calculate_cosine_similarity(raw_text, user_question)
    st.write("Cosine similarity score: ", similarity_score)

    if ("The answer is not available in the context" in initial_response
            or "The provided context does not contain any information" in initial_response):
        if similarity_score > 0.00125:
            # The question is at least loosely related to the document: let LLaMA-2 try.
            refined_response = get_llama_response(user_question, no_words=500,
                                                  blog_style="detailed",
                                                  response_language=response_language)
        else:
            refined_response = "I'm sorry, I cannot answer this question based on the provided context."
    else:
        # Polish Gemini's grounded answer with LLaMA-2.
        refined_response = get_llama_response(initial_response, no_words=500,
                                              blog_style="detailed",
                                              response_language=response_language)
    st.write("Generated Response:", refined_response)


def main():
    st.set_page_config(page_title="Chat With AUTHOR", page_icon="📚", layout='centered')
    st.header("Enhance Understanding with Gemini and LLaMA-2 models 🤖")
    user_question = st.text_input("Ask a Question from the PDF Files uploaded")

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
        # response_language was referenced but never defined in the original;
        # a sidebar text input is an assumed fix.
        response_language = st.text_input("Response language", value="English")
        if st.button("Submit & Process"):
            with st.spinner("Processing..."):
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                get_vector_store(text_chunks)
                st.success("Done")

    if user_question:
        if not pdf_docs:
            st.warning("Please upload and process PDF files first.")
            return
        raw_text = get_pdf_text(pdf_docs)
        text_chunks = get_text_chunks(raw_text)
        get_vector_store(text_chunks)
        user_input(user_question, raw_text, response_language)


if __name__ == "__main__":
    main()