Spaces: Runtime error
Upload 4 files
- app.py +99 -0
- function.py +123 -0
- htmlTemplate.py +89 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,99 @@
from function import scraping_pipeline
import os
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from dotenv import load_dotenv
import pickle
from htmlTemplate import css, bot_template, user_template

load_dotenv()

def data_pipeline(urls):
    documents = scraping_pipeline(urls)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    chunks_text = text_splitter.split_documents(documents)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_stores = FAISS.from_documents(chunks_text, embeddings)
    return vector_stores


def main():

    st.set_page_config(
        page_title="News Website QnA using LLM",
        page_icon="📱",
        layout="wide"
    )

    st.write(css, unsafe_allow_html=True)

    st.title('News Website QnA using LLM 📰')
    file_name = "faiss_store_openai.pkl"             # Provide a filename
    file_path = os.path.join("vectordb", file_name)  # Join the directory and filename

    if not os.path.exists("vectordb"):
        os.makedirs("vectordb")

    llm = ChatGoogleGenerativeAI(model="gemini-pro")

    with st.sidebar:
        st.subheader("Input Indonesian News Article Link🔗")
        num_link = st.number_input(
            'How many links do you want to input',
            min_value=0,
            max_value=5,
            value=1
        )
        urls = []
        for i in range(1, num_link + 1):
            url = st.text_input(f"Indonesian News Article [CNN, Kompas, Detik] No {i}")
            urls.append(url)

        process_links = False
        if "" not in urls:
            process_links = st.button("Process URL")

        if process_links:
            with st.spinner("Processing..."):
                vector_stores_gemini = data_pipeline(urls)
                # Save the FAISS index to a pickle file
                with open(file_path, "wb") as f:
                    pickle.dump(vector_stores_gemini, f)
                st.success("Data has been processed", icon="✅")

    user_question = st.chat_input("Ask a question about your documents:")

    if user_question:
        st.write(user_template.replace("{{MSG}}", user_question), unsafe_allow_html=True)
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                vector_stores = pickle.load(f)

            chain = RetrievalQAWithSourcesChain.from_llm(
                llm=llm,
                retriever=vector_stores.as_retriever()
            )
            result = chain(
                {"question": user_question},
                return_only_outputs=True
            )
            # result is a dictionary of the form {"answer": "", "sources": ""}
            # Display sources, if available
            sources = result.get("sources", "")
            if sources:
                response = f"{result['answer']} \n\nsource: {sources}"
                st.write(bot_template.replace("{{MSG}}", response), unsafe_allow_html=True)
            else:
                response = result['answer']
                st.write(bot_template.replace("{{MSG}}", response), unsafe_allow_html=True)

if __name__ == '__main__':
    main()
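Note on persistence: app.py pickles the whole FAISS vector store object. Depending on the langchain version, the underlying faiss index (a C++ object) may not pickle reliably, and the wrapper's own save_local/load_local helpers are the usual route. Below is a minimal sketch of that alternative, not the author's method; it assumes langchain's FAISS API, reuses the embedding model named in app.py, and the folder path mirrors the original "vectordb" directory. The allow_dangerous_deserialization flag only exists in newer releases.

# Hedged sketch: persist and reload the FAISS store via save_local/load_local
# instead of pickle. Folder path and embedding model follow app.py above.
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

def save_store(vector_store, path="vectordb/faiss_index"):
    # Writes index.faiss plus a small metadata pickle into the folder
    vector_store.save_local(path)

def load_store(path="vectordb/faiss_index"):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return FAISS.load_local(
        path,
        embeddings,
        allow_dangerous_deserialization=True,  # only needed/available in recent langchain versions
    )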
function.py
ADDED
@@ -0,0 +1,123 @@
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredURLLoader
from langchain_core.documents.base import Document
from urllib.parse import urlparse

# url = input("Insert Link That You Want to Scrape:")

def scrape_cnn(url):
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        result = soup.find_all(class_="detail-wrap flex gap-4 relative")

        # Clean up and concatenate the text using a for loop
        cleaned_text_list = []
        for element in result:
            cleaned_text = element.get_text().replace('\n', '').strip()
            cleaned_text_list.append(cleaned_text)

        # Join the cleaned text from the list
        all_text = " ".join(cleaned_text_list)

        # # Print or use the cleaned and concatenated text
        # print(all_text)

        # # Write the result to a text file
        # with open("result.txt", "w", encoding="utf-8") as f:
        #     f.write(all_text)

        return all_text
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")

def scrape_kompas(url):
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        scripts = soup.find_all('script')
        for script in scripts:
            script_text = script.get_text()
            if "var keywordBrandSafety" in script_text:
                result = script_text
                result = result.replace("var keywordBrandSafety =", "").strip().strip('"').strip('";')
                return result
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")

def scrape_detik(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        results = soup.find_all(class_='detail__body-text itp_bodycontent')
        # Extract and return the text from each element
        cleaned_text_list = []
        for element in results:
            text = element.get_text().replace('\n', '').strip()
            cleaned_text_list.append(text)

        # Join the cleaned text from the list
        all_text = " ".join(cleaned_text_list)

        return all_text
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")

def document_instance(link, content):
    doc = Document(
        metadata={'source': link},
        page_content=content
    )
    return doc

def scrape_cnn_instance(url):
    content = scrape_cnn(url)
    return document_instance(url, content)

def scrape_kompas_instance(url):
    content = scrape_kompas(url)
    return document_instance(url, content)

def scrape_detik_instance(url):
    content = scrape_detik(url)
    return document_instance(url, content)

def scraping_pipeline(links: list):
    result = []
    for link in links:
        parsed_url = urlparse(link)
        domain = parsed_url.netloc

        # filter for detik links
        if "detik.com" in domain:
            result.append(scrape_detik_instance(link))

        # filter for cnn
        elif "cnnindonesia.com" in domain:
            result.append(scrape_cnn_instance(link))

        # filter for kompas
        elif "kompas.com" in domain:
            result.append(scrape_kompas_instance(link))

        else:
            print(f"Unsupported domain {domain}; only detik.com, cnnindonesia.com, and kompas.com are handled.")
    return result

def langchain_url(url):
    loader = UnstructuredURLLoader([url])
    data = loader.load()
    return data


links = [
    'https://www.cnnindonesia.com/ekonomi/20231221152333-78-1040259/rupiah-merosot-ke-rp15525-jelang-rilis-data-inflasi-as',
    'https://www.cnnindonesia.com/olahraga/20231221131224-142-1040147/mohamed-salah-vs-arsenal-tajam-dan-lebih-sering-menang',
    'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn'
]

if __name__ == "__main__":
    print(scraping_pipeline(links=links))
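scraping_pipeline returns a list of langchain Document objects, one per supported URL, which app.py then passes to the text splitter. A small usage sketch, assuming one of the example URLs above and a network connection (the printed content is whatever the live page returns):

# Hedged usage sketch: inspect the Document objects produced by scraping_pipeline
from function import scraping_pipeline

docs = scraping_pipeline(links=[
    'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn',
])
for doc in docs:
    print(doc.metadata["source"])    # the original URL kept as the source
    print(doc.page_content[:200])    # first 200 characters of the scraped article text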
htmlTemplate.py
ADDED
@@ -0,0 +1,89 @@
# Updated CSS
# CSS Styles
css = '''
<style>
/* Styling for the body of the Streamlit app */
body {
    background-color: #f2f7ff; /* Soft blue background */
    margin: 0;  /* Remove default margin */
    padding: 0; /* Remove default padding */
}

/* Styling for the chat container */
.chat-container {
    max-width: 600px;          /* Adjust the maximum width as needed */
    margin: 0 auto;            /* Center the chat container */
    background-color: #ffffff; /* White background */
    padding: 1rem;             /* Add padding to the chat container */
    border-radius: 1rem;       /* Rounded corners for the chat container */
    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); /* Add a subtle box shadow */
}

/* Styling for the chat messages */
.chat-message {
    padding: 1rem;
    border-radius: 0.5rem;
    margin-bottom: 1rem;
    display: flex;
    border: 1px solid #d3d3d3; /* Add a subtle border */
}

/* Styling for user messages */
.chat-message.user {
    background-color: #ffffff; /* White background for user messages */
}

/* Styling for bot messages */
.chat-message.bot {
    background-color: #9dc8e5; /* Soft blue background for bot messages */
}

/* Styling for the avatar */
.chat-message .avatar {
    width: 15%; /* Adjust avatar size */
}

/* Styling for the avatar image */
.chat-message .avatar img {
    max-width: 60px;
    max-height: 60px;
    border-radius: 50%;
    object-fit: cover;
}

/* Styling for the message content */
.chat-message .message {
    flex: 1; /* Allow the message to take up remaining space */
    padding: 0.75rem;
    color: #495057; /* Dark text color for better readability */
}

/* Styling for strong (name) in the message */
.chat-message .message strong {
    margin-right: 0.25rem; /* Adjust the margin as needed */
}
</style>
'''

# HTML Templates for Bot and User Messages
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/dp2yyWP/bot.jpg">
    </div>
    <div class="message">
        <strong>Doraemon:</strong> {{MSG}}
    </div>
</div>
'''

user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/JB2sps1/human.jpg">
    </div>
    <div class="message">
        <strong>Nobita:</strong> {{MSG}}
    </div>
</div>
'''
requirements.txt
ADDED
@@ -0,0 +1,5 @@
beautifulsoup4
langchain
streamlit
langchain-google-genai
unstructured
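Note that app.py and function.py also import packages that are not pinned here: requests and python-dotenv are imported directly, and FAISS.from_documents needs a faiss build such as faiss-cpu. The Google Generative AI integration reads its API key from the environment (typically GOOGLE_API_KEY), which load_dotenv() picks up from a local .env file. A fuller requirements sketch, with the additions marked as assumptions rather than part of the original file:

beautifulsoup4
langchain
streamlit
langchain-google-genai
unstructured
# assumed additions, not in the original requirements.txt:
requests
python-dotenv
faiss-cpu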