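"""Streamlit app for chatting with uploaded PDFs.

Questions are answered with Gemini over a FAISS index built from the PDF
text; when the answer is not found in the context, the app falls back to a
local quantized LLaMA-2 model served through CTransformers.
"""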
import os
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from langchain_community.llms import CTransformers
from googletrans import Translator  # required by translate_text below
# Load environment variables
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
raise ValueError("Google API key not found. Please check your environment variables.")
genai.configure(api_key=google_api_key)
# Download NLTK stopwords and extend them with chat-style filler words
nltk.download('stopwords')
stop_words = stopwords.words('english')
custom_stopwords = ["what", "is", "how", "who", "explain", "about", "?", "please", "hey", "whatsup", "can u explain"]
stop_words.extend(custom_stopwords)
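# Extract raw text from the uploaded PDFs; pages with no extractable text
# contribute an empty string instead of raising an error.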
def get_pdf_text(pdf_docs):
text = ""
for pdf in pdf_docs:
pdf_reader = PdfReader(pdf)
for page in pdf_reader.pages:
text += page.extract_text() or ""
return text
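# Split the extracted text into large overlapping chunks for embedding.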
def get_text_chunks(text):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
return text_splitter.split_text(text)
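# Embed the chunks with Gemini embeddings and persist a FAISS index locally.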
def get_vector_store(text_chunks):
try:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
vector_store.save_local("faiss_index")
except Exception as e:
st.error(f"Error during embedding: {e}")
def get_conversational_chain():
prompt_template = """
Please provide a detailed answer based on the provided context. If the necessary information to answer the question is not present in the context, respond with 'The answer is not available in the context'
Context:
{context}
Question:
{question}
Answer:
"""
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
return load_qa_chain(model, chain_type="stuff", prompt=prompt)
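# Generate a response with a local quantized LLaMA-2 model via CTransformers;
# the GGML model file must be available locally.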
def get_llama_response(input_text, no_words, blog_style, response_language):
llm = CTransformers(
model='llama-2-7b-chat.ggmlv3.q8_0.bin',
model_type='llama',
config={'max_new_tokens': 500, 'temperature': 0.01}
)
template = """
Given some information of '{input_text}', provide a concise summary suitable for a {blog_style} blog post in approximately {no_words} words. The total response should be in {response_language} language. Focus on key aspects and provide accurate information.
"""
prompt = PromptTemplate(input_variables=["blog_style", "input_text", 'no_words', 'response_language'],
template=template)
response = llm(prompt.format(input_text=input_text, no_words=no_words, blog_style=blog_style, response_language=response_language))
return response
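# TF-IDF cosine similarity between the full PDF text and the question, used as
# a cheap relevance check. Note that multi-word entries in the custom stopword
# list (e.g. "can u explain") never match the vectorizer's single-word tokens.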
def calculate_cosine_similarity(text, user_question):
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
tfidf_matrix = vectorizer.fit_transform([text, user_question])
cos_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
return cos_similarity
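# Translate text with googletrans (defined but not called elsewhere in the app).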
def translate_text(text, dest_language):
translator = Translator()
translation = translator.translate(text, dest=dest_language)
return translation.text
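# Answer a question: try Gemini over the FAISS index first, then fall back to
# LLaMA-2 when Gemini reports the answer is not in the context.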
def user_input(user_question, raw_text, response_language):
try:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
docs = new_db.similarity_search(user_question)
gemini_chain = get_conversational_chain()
gemini_response = gemini_chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
initial_response = gemini_response["output_text"]
except Exception as e:
# st.error(f"Error during question answering: {e}")
initial_response = "The provided context does not contain any information"
similarity_score = calculate_cosine_similarity(raw_text, user_question)
st.write("Cosine similarity score: ", similarity_score)
if "The answer is not available in the context" in initial_response or "The provided context does not contain any information" in initial_response:
        if similarity_score > 0.00125:
            refined_response = get_llama_response(user_question, no_words=500, blog_style="detailed", response_language=response_language)
        else:
            refined_response = "I'm sorry, I cannot answer this question based on the provided context."
    else:
        refined_response = get_llama_response(initial_response, no_words=500, blog_style="detailed", response_language=response_language)
    st.write("Generated Response:", refined_response)
def main():
st.set_page_config(page_title="Chat With AUTHOR", page_icon="πŸ“š", layout='centered')
st.header("Enhance Understanding with Gemini and LLaMA-2 models πŸ€–")
user_question = st.text_input("Ask a Question from the PDF Files uploaded")
    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
        # The original code referenced `response_language` without defining it;
        # a selector is assumed here (the language list is illustrative).
        response_language = st.selectbox("Response language", ["English", "Hindi", "Spanish"])
        if st.button("Submit & Process"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF file first.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    get_vector_store(text_chunks)
                    st.success("Done")
    if user_question:
        if not pdf_docs:
            st.warning("Please upload your PDF files before asking a question.")
        else:
            raw_text = get_pdf_text(pdf_docs)
            text_chunks = get_text_chunks(raw_text)
            get_vector_store(text_chunks)
            user_input(user_question, raw_text, response_language)
if __name__ == "__main__":
main()
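# To run locally (assuming the dependencies above are installed and
# GOOGLE_API_KEY is set in a .env file): streamlit run app.py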