Bayhaqy committed on
Commit
d212f62
1 Parent(s): 9177866

Update pages/News_Scrapping.py

Browse files
Files changed (1) hide show
  1. pages/News_Scrapping.py +14 -12
pages/News_Scrapping.py CHANGED
@@ -19,9 +19,6 @@ st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':rock
19
  # Set up logging
20
  logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21
 
22
- # Initialize the DataFrame
23
- df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text', 'Translate', 'Status', 'Sentiment'])
24
-
25
  ## ............................................... ##
26
  # Function for translation
27
  def translate_text(text, source='auto', target='en'):
@@ -89,12 +86,10 @@ def process_article(url, config):
89
  keywords = article.keywords
90
  summary = article.summary
91
 
92
- concated_text = title + '| ' + summary
93
  language = detect(concated_text)
94
- tl = translate_text(concated_text, source=language, target='en')
95
- status, predict = predict_sentiment(tl, model, tokenizer)
96
 
97
- return publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict
98
  except Exception as e:
99
  logging.error(f"Article processing error: {str(e)}")
100
  return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
@@ -115,6 +110,15 @@ start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
115
  # Create a variable to track whether the data has been processed
116
  data_processed = False
117
 
 
 
 
 
 
 
 
 
 
118
  # Fetch news and process articles
119
  if st.button('Fetch and Process News'):
120
 
@@ -128,18 +132,16 @@ if st.button('Fetch and Process News'):
128
 
129
  news = google_news.get_news(search_term)
130
 
131
- # Initialize your model and tokenizer
132
- model, tokenizer = get_models_and_tokenizers()
133
-
134
  # Create a custom configuration to disable SSL certificate verification
135
  config = Config()
136
  config.ignore_ssl = True
137
 
138
  # Process articles
139
  for x in news:
140
- publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
 
141
  temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
142
- 'Summary': [summary], 'Text': [text], 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
143
 
144
  df = pd.concat([df, temp_df], ignore_index=True)
145
 
 
19
  # Set up logging
20
  logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21
 
 
 
 
22
  ## ............................................... ##
23
  # Function for translation
24
  def translate_text(text, source='auto', target='en'):
 
86
  keywords = article.keywords
87
  summary = article.summary
88
 
89
+ concated_text = title #+ '| ' + summary
90
  language = detect(concated_text)
 
 
91
 
92
+ return publish_date, language, url, title, authors, keywords, summary, text
93
  except Exception as e:
94
  logging.error(f"Article processing error: {str(e)}")
95
  return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
 
110
  # Create a variable to track whether the data has been processed
111
  data_processed = False
112
 
113
+ # Initialize the DataFrame
114
+ df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text']) #, 'Translate', 'Status', 'Sentiment'])
115
+
116
+ # Initialize your model and tokenizer
117
+ #model, tokenizer = get_models_and_tokenizers()
118
+
119
+ #tl = translate_text(concated_text, source=language, target='en')
120
+ #status, predict = predict_sentiment(tl, model, tokenizer)
121
+
122
  # Fetch news and process articles
123
  if st.button('Fetch and Process News'):
124
 
 
132
 
133
  news = google_news.get_news(search_term)
134
 
 
 
 
135
  # Create a custom configuration to disable SSL certificate verification
136
  config = Config()
137
  config.ignore_ssl = True
138
 
139
  # Process articles
140
  for x in news:
141
+ #publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
142
+ publish_date, language, url, title, authors, keywords, summary, text = process_article(x['url'], config)
143
  temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
144
+ 'Summary': [summary], 'Text': [text]}) #, 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
145
 
146
  df = pd.concat([df, temp_df], ignore_index=True)
147