Bayhaqy committed
Commit 662dfae
1 Parent(s): d212f62

Update pages/News_Scrapping.py

Files changed (1)
  1. pages/News_Scrapping.py +203 -99
pages/News_Scrapping.py CHANGED
@@ -1,3 +1,5 @@
+from streamlit_pandas_profiling import st_profile_report
+from ydata_profiling import ProfileReport
 import streamlit as st
 import pandas as pd
 from newspaper import Article, Config
@@ -8,6 +10,9 @@ import torch
 import requests
 import logging
 from gnews import GNews
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
+from urllib.parse import urlparse
 import nltk
 nltk.download('punkt')
 
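Note on the two new lexicon-based dependencies imported above: both can be sanity-checked in isolation before running the Space. A minimal sketch (the sample sentence is arbitrary; only the vaderSentiment and textblob packages are assumed):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

text = "The market reacted positively to the announcement."

# VADER returns a dict of scores; analyze_sentiment_vader below keys off
# 'compound' with cutoffs at +0.05 and -0.05.
print(SentimentIntensityAnalyzer().polarity_scores(text)['compound'])

# TextBlob exposes a polarity in [-1, 1]; analyze_sentiment_textblob maps
# its sign to Positive / Negative / Neutral.
print(TextBlob(text).sentiment.polarity)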
@@ -15,153 +20,252 @@ nltk.download('punkt')
 # Set page configuration (Call this once and make changes as needed)
 st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':rocket:')
 
+with st.container():
+    # Initialize Streamlit app
+    st.title('News Article Scrapping')
+    st.write("Created by Bayhaqy")
+
 ## ............................................... ##
 # Set up logging
 logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 ## ............................................... ##
-# Function for translation
-def translate_text(text, source='auto', target='en'):
-    try:
-        if source != target:
-            text = GoogleTranslator(source=source, target=target).translate(text)
-        return text
-    except Exception as e:
-        logging.error(f"Translation error: {str(e)}")
-        return text
+# Function for get model and tokenize
+@st.cache_resource
+def get_models_and_tokenizers():
+    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+    #model.eval()
+
+    return model, tokenizer
 
 # Function for sentiment analysis
-def predict_sentiment(text, model, tokenizer):
+@st.cache_resource
+def analyze_sentiment_distilbert(text, _model, _tokenizer):
     try:
-        tokens_info = tokenizer(text, truncation=True, return_tensors="pt")
+        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
         with torch.no_grad():
-            raw_predictions = model(**tokens_info).logits
+            raw_predictions = _model(**tokens_info).logits
 
         predicted_class_id = raw_predictions.argmax().item()
-        predict = model.config.id2label[predicted_class_id]
+        predict = _model.config.id2label[predicted_class_id]
 
         softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
-
         if (softmaxed > 70):
             status = 'Not trust'
         elif (softmaxed > 40):
             status = 'Not sure'
         else:
             status = 'Trust'
-
         return status, predict
+
     except Exception as e:
         logging.error(f"Sentiment analysis error: {str(e)}")
         return 'N/A', 'N/A'
 
-def get_models_and_tokenizers():
-    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
-    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
-    model.eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    return model, tokenizer
+# Function for sentiment analysis using VADER
+@st.cache_data
+def analyze_sentiment_vader(text):
+    analyzer = SentimentIntensityAnalyzer()
+    sentiment = analyzer.polarity_scores(text)
+    compound_score = sentiment['compound']
+    if compound_score >= 0.05:
+        return 'Positive'
+    elif compound_score <= -0.05:
+        return 'Negative'
+    else:
+        return 'Neutral'
+
+# Function for sentiment analysis using TextBlob
+@st.cache_data
+def analyze_sentiment_textblob(text):
+    analysis = TextBlob(text)
+    polarity = analysis.sentiment.polarity
+    if polarity > 0:
+        return 'Positive'
+    elif polarity < 0:
+        return 'Negative'
+    else:
+        return 'Neutral'
 
+## ............................................... ##
 # Function to process an article
-def process_article(url, config):
+@st.cache_data
+def process_article(url, _config):
     try:
-        article = Article(url=url, config=config)
+        article = Article(url=url, config=_config)
         article.download()
         article.parse()
 
-        # Get the article data
-        title = article.title
-        authors = article.authors
-
-        publish_date = article.publish_date
+        # Check if publish_date is not None before further processing
+        if article.publish_date is None:
+            return None # Skip processing and return None
 
-        # Check if publish_date is not None before accessing 'value'
-        if publish_date is not None:
-            publish_date = publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
-        else:
-            publish_date = 'N/A'
+        # Check if text is not None before further processing
+        if len(article.text) <= 5:
+            return None # Skip processing and return None
 
+        # Get the article data if publish_date is not None
         text = article.text
+        url = article.canonical_link
+        source_url = urlparse(url).netloc
+
+        title = article.title
+        authors = article.authors
+        #publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
+        publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')
+
         article.nlp()
-        keywords = article.keywords
+        keywords = article.meta_keywords
         summary = article.summary
 
-        concated_text = title #+ '| ' + summary
-        language = detect(concated_text)
+        language = detect(title)
+
+        return publish_date, language, url, source_url, title, authors, keywords, text, summary
 
-        return publish_date, language, url, title, authors, keywords, summary, text
     except Exception as e:
         logging.error(f"Article processing error: {str(e)}")
-        return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
+        return None # Skip processing and return None
+
+# Function for translation
+@st.cache_data
+def translate_text(text, source='auto', target='en'):
+    try:
+        if source != target:
+            text = GoogleTranslator(source=source, target=target).translate(text)
+        return text
+
+    except Exception as e:
+        logging.error(f"Translation error: {str(e)}")
+        return text
 
 ## ............................................... ##
-# Initialize Streamlit app
-st.title('News Article Scrapping')
-st.write("Created by Bayhaqy")
+with st.container():
+    # Input search parameters
+    search_term = st.text_input('Enter a search term :', 'Indonesia')
+
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        period = st.text_input('Enter a news period :', '7d')
+        max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
+    with col2:
+        country = st.text_input('Country :', 'Indonesia')
+        language = st.text_input('Language :', 'indonesian')
+    with col3:
+        start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
+        end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))
 
 ## ............................................... ##
-# Input search parameters
-search_term = st.text_input('Enter a search term:', 'palestina')
-max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
-country = st.text_input('Country:', 'Indonesia')
-language = st.text_input('Language:', 'indonesian')
-start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
+with st.container():
+    col1, col2 = st.columns(2)
+
+    with col1:
+        # Checkbox options for different processing steps
+        include_translation = st.checkbox("Include Translation", value=False)
+        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
+    with col2:
+        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
+        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)
+
+## ............................................... ##
 
 # Create a variable to track whether the data has been processed
 data_processed = False
 
+## ............................................... ##
+# Create a custom configuration
+config = Config()
+config.number_threads = 200
+config.request_timeout = 3
+
+## ............................................... ##
 # Initialize the DataFrame
-df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text']) #, 'Translate', 'Status', 'Sentiment'])
+df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title', 'Authors', 'Keywords', 'Text', 'Summary'])
 
 # Initialize your model and tokenizer
-#model, tokenizer = get_models_and_tokenizers()
-
-#tl = translate_text(concated_text, source=language, target='en')
-#status, predict = predict_sentiment(tl, model, tokenizer)
-
-# Fetch news and process articles
-if st.button('Fetch and Process News'):
-
-    # Your news retrieval code
-    google_news = GNews()
-
-    google_news.max_results = max_results
-    google_news.country = country
-    google_news.language = language
-    google_news.start_date = (start_date.year, start_date.month, start_date.day)
-
-    news = google_news.get_news(search_term)
+model, tokenizer = get_models_and_tokenizers()
 
-    # Create a custom configuration to disable SSL certificate verification
-    config = Config()
-    config.ignore_ssl = True
-
-    # Process articles
-    for x in news:
-        #publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
-        publish_date, language, url, title, authors, keywords, summary, text = process_article(x['url'], config)
-        temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
-                                'Summary': [summary], 'Text': [text]}) #, 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
-
-        df = pd.concat([df, temp_df], ignore_index=True)
-
-    # Set data_processed to True when the data has been successfully processed
-    data_processed = True
-
-
-# Add a button to download the data as a CSV file
-if data_processed:
-    st.markdown("### Download Processed Data as CSV")
-    st.write("Click the button below to download the processed data as a CSV file.")
-
-    # Create a downloadable link
-    csv_data = df.to_csv(index=False).encode()
-    st.download_button(
-        label="Download CSV",
-        data=csv_data,
-        file_name="processed_data.csv",
-    )
-
-# Display processed data
-if data_processed:
-    st.write(df.head())
+## ............................................... ##
+with st.container():
+    # Fetch news and process articles
+    if st.button('Fetch and Process News'):
+
+        # Your news retrieval code
+        google_news = GNews()
+
+        google_news.period = period # News from last 7 days
+        google_news.max_results = max_results # number of responses across a keyword
+        google_news.country = country # News from a specific country
+        google_news.language = language # News in a specific language
+        #google_news.exclude_websites = ['yahoo.com', 'cnn.com'] # Exclude news from specific website i.e Yahoo.com and CNN.com
+        google_news.start_date = (start_date.year, start_date.month, start_date.day) # Search from 1st Jan 2023
+        google_news.end_date = (end_date.year, end_date.month, end_date.day) # Search until 1st Dec 2023
+
+        news = google_news.get_news(search_term)
+
+        ## ............................................... ##
+        # Progress bar for fetching and processing news
+        progress_bar = st.progress(0)
+        total_news = len(news)
+
+        # Your news retrieval code (assuming 'news' is a list of article URLs)
+        #for x in news:
+        for idx, x in enumerate(news):
+            result = process_article(x['url'], _config=config)
+            if result is not None:
+                publish_date, language, url, source_url, title, authors, keywords, text, summary = result
+                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
+                                        'Text': [text], 'Summary': [summary]})
+                df = pd.concat([df, temp_df], ignore_index=True)
+
+            # Update the progress bar
+            progress = (idx + 1) / total_news
+            progress_bar.progress(progress)
+
+        # Conditionally apply translation function to the 'Translation' column
+        if include_translation:
+            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
+
+        # Conditionally apply sentiment analysis function to the 'Translation' column
+        if include_sentiment_analysis:
+            df[['Fake_Check', 'Sentiment_Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))
+
+        # Conditionally apply VADER sentiment analysis to the 'Translation' column
+        if include_sentiment_vader:
+            df['Sentiment_VADER'] = df['Translation'].apply(analyze_sentiment_vader)
+
+        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+        if include_sentiment_textblob:
+            df['Sentiment_TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
+
+        # Set data_processed to True when the data has been successfully processed
+        data_processed = True
+
+    ## ............................................... ##
+    # Add a button to download the data as a CSV file
+    if data_processed:
+        st.markdown("### Download Processed Data as CSV")
+        st.write("Click the button below to download the processed data as a CSV file.")
+
+        # Create a downloadable link
+        csv_data = df.to_csv(index=False).encode()
+        st.download_button(
+            label="Download CSV",
+            data=csv_data,
+            file_name="processed_data.csv",
+        )
+
+    with st.expander("See Table"):
+        ## ............................................... ##
+        # Display processed data
+        if data_processed:
+            st.dataframe(df)
+
+    with st.expander("See EDA"):
+        ## ............................................... ##
+        # Display processed data
+        if data_processed:
+            pr = ProfileReport(df)
+            st_profile_report(pr)
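
For reference, the fetch-and-parse core of the updated page can be exercised headlessly. A minimal sketch, assuming the same gnews and newspaper3k packages the Space uses; the '7d' period, 3-second timeout, thread count, and the x['url'] key mirror what the diff above sets, while the search term and max_results here are arbitrary:

from gnews import GNews
from newspaper import Article, Config

config = Config()
config.request_timeout = 3   # same timeout the commit configures
config.number_threads = 200  # matches the new custom configuration

google_news = GNews()
google_news.period = '7d'
google_news.max_results = 3

for x in google_news.get_news('Indonesia'):
    article = Article(url=x['url'], config=config)
    article.download()
    article.parse()
    # Skip undated or near-empty pages, as the reworked process_article does
    if article.publish_date is None or len(article.text) <= 5:
        continue
    print(article.publish_date.strftime('%Y-%m-%d %H:%M'), '|', article.title)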