from function import scraping_pipeline import os import streamlit as st from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import FAISS from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings from langchain.chains import RetrievalQAWithSourcesChain from dotenv import load_dotenv import pickle from htmlTemplate import css, bot_template, user_template load_dotenv() def data_pipeline(urls): documents = scraping_pipeline(urls) text_splitter = RecursiveCharacterTextSplitter( chunk_size = 500, chunk_overlap = 50 ) chunks_text = text_splitter.split_documents(documents) embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") vector_stores = FAISS.from_documents(chunks_text, embeddings) return vector_stores def main(): st.set_page_config( page_title= "News Website QnA using LLM", page_icon= "πŸ“±", layout="wide" ) st.write(css, unsafe_allow_html=True) st.title('News Website QnA using LLM πŸ“°') # process_links = False file_name = "faiss_store_openai.pkl" # Provide a filename file_path = os.path.join("vectordb", file_name) # Join the directory and filename if not os.path.exists("vectordb"): os.makedirs("vectordb") llm = ChatGoogleGenerativeAI(model="gemini-pro") with st.sidebar: st.subheader("Input Indonesian News Article LinkπŸ”—") num_link = st.number_input( 'How many links you want to input', min_value= 0, max_value= 5, value = 1 ) urls = [] for i in range(1,num_link+1): url = st.text_input(f"Indonesian News Article [CNN, Kompas, Detik] No {i}") urls.append(url) process_links = False if "" not in urls: process_links = st.button("Process URL") if process_links: with st.spinner("Processing..."): vector_stores_gemini = data_pipeline(urls) # Save the FAISS index to a pickle file with open(file_path, "wb") as f: pickle.dump(vector_stores_gemini, f) st.success("Data has been process", icon="βœ…") user_question = st.chat_input("Ask a question about your documents:") if user_question: st.write(user_template.replace("{{MSG}}",user_question), unsafe_allow_html= True) if os.path.exists(file_path): with open(file_path, 'rb') as f: vector_stores = pickle.load(f) chain = RetrievalQAWithSourcesChain.from_llm( llm = llm, retriever = vector_stores.as_retriever() ) result = chain( {"question": user_question}, return_only_outputs= True ) # result will be a dictionary of this format --> {"answer": "", "sources": [] } # Display sources, if available sources = result.get("sources", "") if sources: response = f"{result['answer']} \n\nsource: {sources}" st.write(bot_template.replace("{{MSG}}",response), unsafe_allow_html= True) else: response = result['answer'] st.write(bot_template.replace("{{MSG}}",response), unsafe_allow_html= True) if __name__ == '__main__': main()