LuckRafly committed on
Commit df7271b · 1 Parent(s): 0da1eaf

Upload 4 files

Files changed (4)
  1. app.py +99 -0
  2. function.py +123 -0
  3. htmlTemplate.py +89 -0
  4. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,99 @@
+ from function import scraping_pipeline
+ import os
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
+ from langchain.chains import RetrievalQAWithSourcesChain
+ from dotenv import load_dotenv
+ import pickle
+ from htmlTemplate import css, bot_template, user_template
+
+ load_dotenv()
+
+ def data_pipeline(urls):
+     # Scrape the articles, split them into overlapping chunks,
+     # and embed the chunks into a FAISS vector store
+     documents = scraping_pipeline(urls)
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=50
+     )
+     chunks_text = text_splitter.split_documents(documents)
+     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+     vector_stores = FAISS.from_documents(chunks_text, embeddings)
+     return vector_stores
+
+
+ def main():
+     st.set_page_config(
+         page_title="News Website QnA using LLM",
+         page_icon="📱",
+         layout="wide"
+     )
+
+     st.write(css, unsafe_allow_html=True)
+
+     st.title('News Website QnA using LLM 📰')
+     file_name = "faiss_store_openai.pkl"  # Filename for the persisted index
+     file_path = os.path.join("vectordb", file_name)  # Join the directory and filename
+
+     if not os.path.exists("vectordb"):
+         os.makedirs("vectordb")
+
+     llm = ChatGoogleGenerativeAI(model="gemini-pro")
+
+     with st.sidebar:
+         st.subheader("Input Indonesian News Article Link 🔗")
+         num_link = st.number_input(
+             'How many links do you want to input?',
+             min_value=0,
+             max_value=5,
+             value=1
+         )
+         urls = []
+         for i in range(1, num_link + 1):
+             url = st.text_input(f"Indonesian News Article [CNN, Kompas, Detik] No {i}")
+             urls.append(url)
+
+         # Only enable processing once every URL field is filled in
+         process_links = False
+         if "" not in urls:
+             process_links = st.button("Process URL")
+
+         if process_links:
+             with st.spinner("Processing..."):
+                 vector_stores_gemini = data_pipeline(urls)
+                 # Save the FAISS index to a pickle file
+                 with open(file_path, "wb") as f:
+                     pickle.dump(vector_stores_gemini, f)
+             st.success("Data has been processed", icon="✅")
+
+     user_question = st.chat_input("Ask a question about your documents:")
+
+     if user_question:
+         st.write(user_template.replace("{{MSG}}", user_question), unsafe_allow_html=True)
+         if os.path.exists(file_path):
+             with open(file_path, 'rb') as f:
+                 vector_stores = pickle.load(f)
+
+             chain = RetrievalQAWithSourcesChain.from_llm(
+                 llm=llm,
+                 retriever=vector_stores.as_retriever()
+             )
+             result = chain(
+                 {"question": user_question},
+                 return_only_outputs=True
+             )
+             # result is a dictionary of the form {"answer": "", "sources": ""}
+             # Append the sources to the answer, if available
+             sources = result.get("sources", "")
+             if sources:
+                 response = f"{result['answer']} \n\nsource: {sources}"
+             else:
+                 response = result['answer']
+             st.write(bot_template.replace("{{MSG}}", response), unsafe_allow_html=True)
+
+ if __name__ == '__main__':
+     main()
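A note on running this file: ChatGoogleGenerativeAI and GoogleGenerativeAIEmbeddings read the GOOGLE_API_KEY environment variable, which load_dotenv() loads from a .env file in the project root. So a minimal local setup, assuming a Google AI Studio key, is a one-line .env (GOOGLE_API_KEY=<your key>) followed by streamlit run app.py.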
function.py ADDED
@@ -0,0 +1,123 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from langchain.document_loaders import UnstructuredURLLoader
+ from langchain_core.documents.base import Document
+ from urllib.parse import urlparse
+
+ def scrape_cnn(url):
+     response = requests.get(url)
+     # Check if the request was successful (status code 200)
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+         result = soup.find_all(class_="detail-wrap flex gap-4 relative")
+
+         # Clean up and collect the text from each matching element
+         cleaned_text_list = []
+         for element in result:
+             cleaned_text = element.get_text().replace('\n', '').strip()
+             cleaned_text_list.append(cleaned_text)
+
+         # Join the cleaned text from the list
+         all_text = " ".join(cleaned_text_list)
+         return all_text
+     else:
+         print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
+
+ def scrape_kompas(url):
+     response = requests.get(url)
+     # Check if the request was successful (status code 200)
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+         # Kompas embeds the article text in a JavaScript variable
+         scripts = soup.find_all('script')
+         for script in scripts:
+             script_text = script.get_text()
+             if "var keywordBrandSafety" in script_text:
+                 result = script_text.replace("var keywordBrandSafety =", "").strip().strip('";')
+                 return result
+     else:
+         print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
+
+ def scrape_detik(url):
+     response = requests.get(url)
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+         results = soup.find_all(class_='detail__body-text itp_bodycontent')
+         # Extract and clean the text from each element
+         cleaned_text_list = []
+         for element in results:
+             text = element.get_text().replace('\n', '').strip()
+             cleaned_text_list.append(text)
+
+         # Join the cleaned text from the list
+         all_text = " ".join(cleaned_text_list)
+         return all_text
+     else:
+         print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
+
+ def document_instance(link, content):
+     # Wrap scraped text in a langchain Document, keeping the URL as its source
+     return Document(
+         metadata={'source': link},
+         page_content=content
+     )
+
+ def scrape_cnn_instance(url):
+     content = scrape_cnn(url)
+     return document_instance(url, content)
+
+ def scrape_kompas_instance(url):
+     content = scrape_kompas(url)
+     return document_instance(url, content)
+
+ def scrape_detik_instance(url):
+     content = scrape_detik(url)
+     return document_instance(url, content)
+
+ def scraping_pipeline(links: list):
+     # Route each link to the scraper that matches its domain
+     result = []
+     for link in links:
+         parsed_url = urlparse(link)
+         domain = parsed_url.netloc
+
+         # filter for detik links
+         if "detik.com" in domain:
+             result.append(scrape_detik_instance(link))
+
+         # filter for cnn
+         elif "cnnindonesia.com" in domain:
+             result.append(scrape_cnn_instance(link))
+
+         # filter for kompas
+         elif "kompas.com" in domain:
+             result.append(scrape_kompas_instance(link))
+
+         else:
+             print(f"Skipping {link}: unsupported domain {domain}")
+     return result
+
+ def langchain_url(url):
+     # Fallback loader that fetches a page via UnstructuredURLLoader
+     loader = UnstructuredURLLoader([url])
+     data = loader.load()
+     return data
+
+
+ links = [
+     'https://www.cnnindonesia.com/ekonomi/20231221152333-78-1040259/rupiah-merosot-ke-rp15525-jelang-rilis-data-inflasi-as',
+     'https://www.cnnindonesia.com/olahraga/20231221131224-142-1040147/mohamed-salah-vs-arsenal-tajam-dan-lebih-sering-menang',
+     'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn'
+ ]
+
+ if __name__ == "__main__":
+     print(scraping_pipeline(links=links))
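For reference, scraping_pipeline returns a list of langchain Document objects whose metadata carries the source URL, which is what RetrievalQAWithSourcesChain later cites. A minimal consumption sketch, reusing the demo links above:

    docs = scraping_pipeline(links)
    for doc in docs:
        print(doc.metadata['source'], doc.page_content[:80])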
htmlTemplate.py ADDED
@@ -0,0 +1,89 @@
+ # CSS Styles
+ css = '''
+ <style>
+ /* Styling for the body of the Streamlit app */
+ body {
+     background-color: #f2f7ff; /* Soft blue background */
+     margin: 0; /* Remove default margin */
+     padding: 0; /* Remove default padding */
+ }
+
+ /* Styling for the chat container */
+ .chat-container {
+     max-width: 600px; /* Adjust the maximum width as needed */
+     margin: 0 auto; /* Center the chat container */
+     background-color: #ffffff; /* White background */
+     padding: 1rem; /* Add padding to the chat container */
+     border-radius: 1rem; /* Rounded corners for the chat container */
+     box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); /* Add a subtle box shadow */
+ }
+
+ /* Styling for the chat messages */
+ .chat-message {
+     padding: 1rem;
+     border-radius: 0.5rem;
+     margin-bottom: 1rem;
+     display: flex;
+     border: 1px solid #d3d3d3; /* Add a subtle border */
+ }
+
+ /* Styling for user messages */
+ .chat-message.user {
+     background-color: #ffffff; /* White background for user messages */
+ }
+
+ /* Styling for bot messages */
+ .chat-message.bot {
+     background-color: #9dc8e5; /* Soft blue background for bot messages */
+ }
+
+ /* Styling for the avatar */
+ .chat-message .avatar {
+     width: 15%; /* Adjust avatar size */
+ }
+
+ /* Styling for the avatar image */
+ .chat-message .avatar img {
+     max-width: 60px;
+     max-height: 60px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+
+ /* Styling for the message content */
+ .chat-message .message {
+     flex: 1; /* Allow the message to take up remaining space */
+     padding: 0.75rem;
+     color: #495057; /* Dark text color for better readability */
+ }
+
+ /* Styling for strong (name) in the message */
+ .chat-message .message strong {
+     margin-right: 0.25rem; /* Adjust the margin as needed */
+ }
+ </style>
+ '''
+
+ # HTML Templates for Bot and User Messages
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://i.ibb.co/dp2yyWP/bot.jpg">
+     </div>
+     <div class="message">
+         <strong>Doraemon:</strong> {{MSG}}
+     </div>
+ </div>
+ '''
+
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://i.ibb.co/JB2sps1/human.jpg">
+     </div>
+     <div class="message">
+         <strong>Nobita:</strong> {{MSG}}
+     </div>
+ </div>
+ '''
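Both templates expose a {{MSG}} placeholder that app.py fills with a plain str.replace before rendering, e.g.:

    st.write(bot_template.replace("{{MSG}}", "Hello!"), unsafe_allow_html=True)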
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ beautifulsoup4
+ langchain
+ streamlit
+ langchain-google-genai
+ unstructured
+ faiss-cpu
+ python-dotenv
+ requests
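A typical setup, assuming Python 3.9+ in a fresh virtual environment, is pip install -r requirements.txt. Note that faiss-cpu, python-dotenv, and requests are required because app.py builds a FAISS index and calls load_dotenv(), and function.py fetches pages with requests.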