Update pages/News_Scrapping.py
pages/News_Scrapping.py  +203 −99  CHANGED
@@ -1,3 +1,5 @@
+from streamlit_pandas_profiling import st_profile_report
+from ydata_profiling import ProfileReport
 import streamlit as st
 import pandas as pd
 from newspaper import Article, Config
@@ -8,6 +10,9 @@ import torch
 import requests
 import logging
 from gnews import GNews
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
+from urllib.parse import urlparse
 import nltk
 nltk.download('punkt')

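The two hunks above skip the file's unchanged lines 6-9, which the added code below relies on. Judging by the names used later in the file (AutoTokenizer, AutoModelForSequenceClassification, detect, GoogleTranslator, torch), those unchanged lines presumably look roughly like this sketch; the exact wording is an assumption, since the diff does not show them:

# Assumed (not shown in this diff): the unchanged import lines 6-9
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # used by get_models_and_tokenizers()
from langdetect import detect                 # used by process_article()
from deep_translator import GoogleTranslator  # used by translate_text()
import torch                                  # referenced by the '@@ ... @@ import torch' hunk header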
@@ -15,153 +20,252 @@ nltk.download('punkt')
 # Set page configuration (Call this once and make changes as needed)
 st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':rocket:')

+with st.container():
+    # Initialize Streamlit app
+    st.title('News Article Scrapping')
+    st.write("Created by Bayhaqy")
+
 ## ............................................... ##
 # Set up logging
 logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

 ## ............................................... ##
+# Function to get the model and tokenizer
+@st.cache_resource
+def get_models_and_tokenizers():
+    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+    #model.eval()
+
+    return model, tokenizer

 # Function for sentiment analysis
+@st.cache_resource
+def analyze_sentiment_distilbert(text, _model, _tokenizer):
     try:
+        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
         with torch.no_grad():
+            raw_predictions = _model(**tokens_info).logits

         predicted_class_id = raw_predictions.argmax().item()
+        predict = _model.config.id2label[predicted_class_id]

         softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
         if (softmaxed > 70):
             status = 'Not trust'
         elif (softmaxed > 40):
             status = 'Not sure'
         else:
             status = 'Trust'
         return status, predict
+
     except Exception as e:
         logging.error(f"Sentiment analysis error: {str(e)}")
         return 'N/A', 'N/A'

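As a quick orientation (not part of the commit), this is how the two cached helpers above fit together; the sample sentence is purely illustrative. The leading underscore on the _model and _tokenizer parameters tells Streamlit's cache not to attempt hashing those arguments:

model, tokenizer = get_models_and_tokenizers()   # loaded once per session via @st.cache_resource
status, predict = analyze_sentiment_distilbert(
    "The ministry confirmed the new policy today.", model, tokenizer)
# predict -> the checkpoint's id2label value for the top class ('POSITIVE' or 'NEGATIVE')
# status  -> 'Trust' / 'Not sure' / 'Not trust', thresholded on the softmax score of class index 1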
+# Function for sentiment analysis using VADER
+@st.cache_data
+def analyze_sentiment_vader(text):
+    analyzer = SentimentIntensityAnalyzer()
+    sentiment = analyzer.polarity_scores(text)
+    compound_score = sentiment['compound']
+    if compound_score >= 0.05:
+        return 'Positive'
+    elif compound_score <= -0.05:
+        return 'Negative'
+    else:
+        return 'Neutral'
+
+# Function for sentiment analysis using TextBlob
+@st.cache_data
+def analyze_sentiment_textblob(text):
+    analysis = TextBlob(text)
+    polarity = analysis.sentiment.polarity
+    if polarity > 0:
+        return 'Positive'
+    elif polarity < 0:
+        return 'Negative'
+    else:
+        return 'Neutral'

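Both rule-based helpers above map free text straight to a three-way label, so they can be sanity-checked in isolation; the sample string below is illustrative only:

sample = "The election results were a great relief for investors."
print(analyze_sentiment_vader(sample))     # expected 'Positive' (VADER compound >= 0.05)
print(analyze_sentiment_textblob(sample))  # expected 'Positive' (TextBlob polarity > 0)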
+## ............................................... ##
 # Function to process an article
+@st.cache_data
+def process_article(url, _config):
     try:
+        article = Article(url=url, config=_config)
         article.download()
         article.parse()

-        publish_date = article.publish_date
-        else:
-            publish_date = 'N/A'
+        # Check if publish_date is not None before further processing
+        if article.publish_date is None:
+            return None  # Skip processing and return None

+        # Check if text is not None before further processing
+        if len(article.text) <= 5:
+            return None  # Skip processing and return None

+        # Get the article data if publish_date is not None
         text = article.text
+        url = article.canonical_link
+        source_url = urlparse(url).netloc
+
+        title = article.title
+        authors = article.authors
+        #publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
+        publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')
+
         article.nlp()
+        keywords = article.meta_keywords
         summary = article.summary

+        language = detect(title)
+
-        return publish_date, language, url, title, authors, keywords, summary, text
+        return publish_date, language, url, source_url, title, authors, keywords, text, summary

     except Exception as e:
         logging.error(f"Article processing error: {str(e)}")
+        return None  # Skip processing and return None
+
+# Function for translation
+@st.cache_data
+def translate_text(text, source='auto', target='en'):
+    try:
+        if source != target:
+            text = GoogleTranslator(source=source, target=target).translate(text)
+        return text
+
+    except Exception as e:
+        logging.error(f"Translation error: {str(e)}")
+        return text

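A minimal sketch of exercising the two helpers above on a single URL (the URL is a placeholder; process_article returns None for pages with no parseable publish date or with essentially no text, so the caller has to check for that):

config = Config()
config.request_timeout = 3

result = process_article('https://example.com/some-article', _config=config)
if result is not None:
    publish_date, language, url, source_url, title, authors, keywords, text, summary = result
    summary_en = translate_text(summary, source=language, target='en')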
 ## ............................................... ##
-max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
-country = st.text_input('Country:', 'Indonesia')
-language = st.text_input('Language:', 'indonesian')
-start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
+with st.container():
+    # Input search parameters
+    search_term = st.text_input('Enter a search term :', 'Indonesia')
+
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        period = st.text_input('Enter a news period :', '7d')
+        max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
+    with col2:
+        country = st.text_input('Country :', 'Indonesia')
+        language = st.text_input('Language :', 'indonesian')
+    with col3:
+        start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
+        end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))

 ## ............................................... ##
+with st.container():
+    col1, col2 = st.columns(2)

+    with col1:
+        # Checkbox options for different processing steps
+        include_translation = st.checkbox("Include Translation", value=False)
+        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
+    with col2:
+        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
+        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)
+
+## ............................................... ##
 # Create a variable to track whether the data has been processed
 data_processed = False

+## ............................................... ##
+# Create a custom configuration
+config = Config()
+config.number_threads = 200
+config.request_timeout = 3
+
+## ............................................... ##
 # Initialize the DataFrame
+df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title', 'Authors', 'Keywords', 'Text', 'Summary'])

 # Initialize your model and tokenizer
-#tl = translate_text(concated_text, source=language, target='en')
-#status, predict = predict_sentiment(tl, model, tokenizer)
+model, tokenizer = get_models_and_tokenizers()

-# Fetch news and process articles
-if st.button('Fetch and Process News'):
-    # Your news retrieval code
-    google_news = GNews()
-    google_news.max_results = max_results
-    google_news.country = country
-    google_news.language = language
-    google_news.start_date = (start_date.year, start_date.month, start_date.day)
-    news = google_news.get_news(search_term)
+## ............................................... ##
+with st.container():
+    # Fetch news and process articles
+    if st.button('Fetch and Process News'):
+
+        # Your news retrieval code
+        google_news = GNews()
+
+        google_news.period = period            # News from last 7 days
+        google_news.max_results = max_results  # number of responses across a keyword
+        google_news.country = country          # News from a specific country
+        google_news.language = language        # News in a specific language
+        #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # Exclude news from specific websites, i.e. Yahoo.com and CNN.com
+        google_news.start_date = (start_date.year, start_date.month, start_date.day)  # Search from 1st Jan 2023
+        google_news.end_date = (end_date.year, end_date.month, end_date.day)          # Search until 1st Dec 2023
+
+        news = google_news.get_news(search_term)
+
+        ## ............................................... ##
+        # Progress bar for fetching and processing news
+        progress_bar = st.progress(0)
+        total_news = len(news)
+
+        # Process each article ('news' is a list of article dicts returned by GNews)
+        for idx, x in enumerate(news):
+            result = process_article(x['url'], _config=config)
+            if result is not None:
+                publish_date, language, url, source_url, title, authors, keywords, text, summary = result
+                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
+                                        'Text': [text], 'Summary': [summary]})
+                df = pd.concat([df, temp_df], ignore_index=True)
+
+            # Update the progress bar
+            progress = (idx + 1) / total_news
+            progress_bar.progress(progress)
+
+        # Conditionally apply translation function to the 'Translation' column
+        if include_translation:
+            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
+
+        # Conditionally apply sentiment analysis function to the 'Translation' column
+        if include_sentiment_analysis:
+            df[['Fake_Check', 'Sentiment_Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))
+
+        # Conditionally apply VADER sentiment analysis to the 'Translation' column
+        if include_sentiment_vader:
+            df['Sentiment_VADER'] = df['Translation'].apply(analyze_sentiment_vader)
+
+        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+        if include_sentiment_textblob:
+            df['Sentiment_TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
+
+        # Set data_processed to True when the data has been successfully processed
+        data_processed = True
+
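One thing worth noting when reading the block above: all three sentiment branches index df['Translation'], which is only created when "Include Translation" is ticked. A small guard along these lines (illustrative only, not part of this commit) would let the sentiment checkboxes work on their own:

# Fall back to the untranslated title/summary when translation was not requested
if not include_translation and (include_sentiment_analysis or include_sentiment_vader or include_sentiment_textblob):
    df['Translation'] = df['Title'] + ' | ' + df['Summary']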
+## ............................................... ##
+# Add a button to download the data as a CSV file
+if data_processed:
+    st.markdown("### Download Processed Data as CSV")
+    st.write("Click the button below to download the processed data as a CSV file.")
+
+    # Create a downloadable link
+    csv_data = df.to_csv(index=False).encode()
+    st.download_button(
+        label="Download CSV",
+        data=csv_data,
+        file_name="processed_data.csv",
+    )
+
+with st.expander("See Table"):
+    ## ............................................... ##
+    # Display processed data
+    if data_processed:
+        st.dataframe(df)
+
+with st.expander("See EDA"):
+    ## ............................................... ##
+    # Display processed data
+    if data_processed:
+        pr = ProfileReport(df)
+        st_profile_report(pr)
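The "See EDA" expander hands the scraped DataFrame to ydata-profiling and renders the report inline via streamlit-pandas-profiling. The same report can also be written out as standalone HTML, for example (the output file name is illustrative):

pr = ProfileReport(df, title='News Scrapping EDA')
pr.to_file('news_scrapping_eda.html')  # writes a self-contained HTML report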