LuckRafly committed on
Commit df7271b · 1 Parent(s): 0da1eaf

Upload 4 files

Files changed (4)
  1. app.py +99 -0
  2. function.py +123 -0
  3. htmlTemplate.py +89 -0
  4. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,99 @@
+ from function import scraping_pipeline
+ import os
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
+ from langchain.chains import RetrievalQAWithSourcesChain
+ from dotenv import load_dotenv
+ import pickle
+ from htmlTemplate import css, bot_template, user_template
+
+ load_dotenv()
+
+ def data_pipeline(urls):
+     # Scrape the articles, split them into overlapping chunks,
+     # and embed the chunks into a FAISS vector store
+     documents = scraping_pipeline(urls)
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=50
+     )
+     chunks_text = text_splitter.split_documents(documents)
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+     vector_stores = FAISS.from_documents(chunks_text, embeddings)
+     return vector_stores
+
+
+ def main():
+     st.set_page_config(
+         page_title="News Website QnA using LLM",
+         page_icon="📱",
+         layout="wide"
+     )
+
+     st.write(css, unsafe_allow_html=True)
+
+     st.title('News Website QnA using LLM 📰')
+     file_name = "faiss_store_openai.pkl"  # Filename for the persisted index
+     file_path = os.path.join("vectordb", file_name)  # Join the directory and filename
+
+     if not os.path.exists("vectordb"):
+         os.makedirs("vectordb")
+
+     llm = ChatGoogleGenerativeAI(model="gemini-pro")
+
+     with st.sidebar:
+         st.subheader("Input Indonesian News Article Link 🔗")
+         num_link = st.number_input(
+             'How many links do you want to input?',
+             min_value=0,
+             max_value=5,
+             value=1
+         )
+         urls = []
+         for i in range(1, num_link + 1):
+             url = st.text_input(f"Indonesian News Article [CNN, Kompas, Detik] No {i}")
+             urls.append(url)
+
+         # Only enable processing once every URL field is filled in
+         process_links = False
+         if "" not in urls:
+             process_links = st.button("Process URL")
+
+         if process_links:
+             with st.spinner("Processing..."):
+                 vector_stores_gemini = data_pipeline(urls)
+                 # Save the FAISS index to a pickle file
+                 with open(file_path, "wb") as f:
+                     pickle.dump(vector_stores_gemini, f)
+             st.success("Data has been processed", icon="✅")
+
+     user_question = st.chat_input("Ask a question about your documents:")
+
+     if user_question:
+         st.write(user_template.replace("{{MSG}}", user_question), unsafe_allow_html=True)
+         if os.path.exists(file_path):
+             with open(file_path, 'rb') as f:
+                 vector_stores = pickle.load(f)
+
+             chain = RetrievalQAWithSourcesChain.from_llm(
+                 llm=llm,
+                 retriever=vector_stores.as_retriever()
+             )
+             result = chain(
+                 {"question": user_question},
+                 return_only_outputs=True
+             )
+             # result is a dictionary of the form {"answer": "", "sources": ""}
+             # Append the sources to the answer, if available
+             sources = result.get("sources", "")
+             if sources:
+                 response = f"{result['answer']} \n\nsource: {sources}"
+             else:
+                 response = result['answer']
+             st.write(bot_template.replace("{{MSG}}", response), unsafe_allow_html=True)
+
+ if __name__ == '__main__':
+     main()
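A note on running this file: ChatGoogleGenerativeAI and GoogleGenerativeAIEmbeddings read the GOOGLE_API_KEY environment variable, which load_dotenv() loads from a .env file in the project root. So a minimal local setup, assuming a Google AI Studio key, is a one-line .env (GOOGLE_API_KEY=<your key>) followed by streamlit run app.py.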
function.py ADDED
@@ -0,0 +1,123 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from langchain.document_loaders import UnstructuredURLLoader
+ from langchain_core.documents.base import Document
+ from urllib.parse import urlparse
+
+ def scrape_cnn(url):
+     response = requests.get(url)
+     # Check if the request was successful (status code 200)
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+         result = soup.find_all(class_="detail-wrap flex gap-4 relative")
+
+         # Clean up and collect the text from each matching element
+         cleaned_text_list = []
+         for element in result:
+             cleaned_text = element.get_text().replace('\n', '').strip()
+             cleaned_text_list.append(cleaned_text)
+
+         # Join the cleaned text from the list
+         all_text = " ".join(cleaned_text_list)
+         return all_text
+     else:
+         print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
+
+ def scrape_kompas(url):
+     response = requests.get(url)
+     # Check if the request was successful (status code 200)
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+         # Kompas embeds the article text in a JavaScript variable
+         scripts = soup.find_all('script')
+         for script in scripts:
+             script_text = script.get_text()
+             if "var keywordBrandSafety" in script_text:
+                 result = script_text.replace("var keywordBrandSafety =", "").strip().strip('";')
+                 return result
+     else:
+         print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
+
+ def scrape_detik(url):
+     response = requests.get(url)
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+         results = soup.find_all(class_='detail__body-text itp_bodycontent')
+         # Extract and clean the text from each element
+         cleaned_text_list = []
+         for element in results:
+             text = element.get_text().replace('\n', '').strip()
+             cleaned_text_list.append(text)
+
+         # Join the cleaned text from the list
+         all_text = " ".join(cleaned_text_list)
+         return all_text
+     else:
+         print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
+
+ def document_instance(link, content):
+     # Wrap scraped text in a langchain Document, keeping the URL as its source
+     return Document(
+         metadata={'source': link},
+         page_content=content
+     )
+
+ def scrape_cnn_instance(url):
+     content = scrape_cnn(url)
+     return document_instance(url, content)
+
+ def scrape_kompas_instance(url):
+     content = scrape_kompas(url)
+     return document_instance(url, content)
+
+ def scrape_detik_instance(url):
+     content = scrape_detik(url)
+     return document_instance(url, content)
+
+ def scraping_pipeline(links: list):
+     # Route each link to the scraper that matches its domain
+     result = []
+     for link in links:
+         parsed_url = urlparse(link)
+         domain = parsed_url.netloc
+
+         # filter for detik links
+         if "detik.com" in domain:
+             result.append(scrape_detik_instance(link))
+
+         # filter for cnn
+         elif "cnnindonesia.com" in domain:
+             result.append(scrape_cnn_instance(link))
+
+         # filter for kompas
+         elif "kompas.com" in domain:
+             result.append(scrape_kompas_instance(link))
+
+         else:
+             print(f"Skipping {link}: unsupported domain {domain}")
+     return result
+
+ def langchain_url(url):
+     # Fallback loader that fetches a page via UnstructuredURLLoader
+     loader = UnstructuredURLLoader([url])
+     data = loader.load()
+     return data
+
+
+ links = [
+     'https://www.cnnindonesia.com/ekonomi/20231221152333-78-1040259/rupiah-merosot-ke-rp15525-jelang-rilis-data-inflasi-as',
+     'https://www.cnnindonesia.com/olahraga/20231221131224-142-1040147/mohamed-salah-vs-arsenal-tajam-dan-lebih-sering-menang',
+     'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn'
+ ]
+
+ if __name__ == "__main__":
+     print(scraping_pipeline(links=links))
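For reference, scraping_pipeline returns a list of langchain Document objects whose metadata carries the source URL, which is what RetrievalQAWithSourcesChain later cites. A minimal consumption sketch, reusing the demo links above:

    docs = scraping_pipeline(links)
    for doc in docs:
        print(doc.metadata['source'], doc.page_content[:80])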
htmlTemplate.py ADDED
@@ -0,0 +1,89 @@
+ # CSS Styles
+ css = '''
+ <style>
+ /* Styling for the body of the Streamlit app */
+ body {
+     background-color: #f2f7ff; /* Soft blue background */
+     margin: 0; /* Remove default margin */
+     padding: 0; /* Remove default padding */
+ }
+
+ /* Styling for the chat container */
+ .chat-container {
+     max-width: 600px; /* Adjust the maximum width as needed */
+     margin: 0 auto; /* Center the chat container */
+     background-color: #ffffff; /* White background */
+     padding: 1rem; /* Add padding to the chat container */
+     border-radius: 1rem; /* Rounded corners for the chat container */
+     box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); /* Add a subtle box shadow */
+ }
+
+ /* Styling for the chat messages */
+ .chat-message {
+     padding: 1rem;
+     border-radius: 0.5rem;
+     margin-bottom: 1rem;
+     display: flex;
+     border: 1px solid #d3d3d3; /* Add a subtle border */
+ }
+
+ /* Styling for user messages */
+ .chat-message.user {
+     background-color: #ffffff; /* White background for user messages */
+ }
+
+ /* Styling for bot messages */
+ .chat-message.bot {
+     background-color: #9dc8e5; /* Soft blue background for bot messages */
+ }
+
+ /* Styling for the avatar */
+ .chat-message .avatar {
+     width: 15%; /* Adjust avatar size */
+ }
+
+ /* Styling for the avatar image */
+ .chat-message .avatar img {
+     max-width: 60px;
+     max-height: 60px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+
+ /* Styling for the message content */
+ .chat-message .message {
+     flex: 1; /* Allow the message to take up remaining space */
+     padding: 0.75rem;
+     color: #495057; /* Dark text color for better readability */
+ }
+
+ /* Styling for strong (name) in the message */
+ .chat-message .message strong {
+     margin-right: 0.25rem; /* Adjust the margin as needed */
+ }
+ </style>
+ '''
+
+ # HTML Templates for Bot and User Messages
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://i.ibb.co/dp2yyWP/bot.jpg">
+     </div>
+     <div class="message">
+         <strong>Doraemon:</strong> {{MSG}}
+     </div>
+ </div>
+ '''
+
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://i.ibb.co/JB2sps1/human.jpg">
+     </div>
+     <div class="message">
+         <strong>Nobita:</strong> {{MSG}}
+     </div>
+ </div>
+ '''
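Both templates expose a {{MSG}} placeholder that app.py fills with a plain str.replace before rendering, e.g.:

    st.write(bot_template.replace("{{MSG}}", "Hello!"), unsafe_allow_html=True)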
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ beautifulsoup4
+ langchain
+ streamlit
+ langchain-google-genai
+ unstructured
+ faiss-cpu
+ python-dotenv
+ requests
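A typical setup, assuming Python 3.9+ in a fresh virtual environment, is pip install -r requirements.txt. Note that faiss-cpu, python-dotenv, and requests are required because app.py builds a FAISS index and calls load_dotenv(), and function.py fetches pages with requests.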