Spaces:

Bayhaqy
/

Classification-News-Analysis-and-Prediction

Running

App Files Community

Bayhaqy committed on Oct 12, 2023

Commit

d212f62

•

1 Parent(s): 9177866

Update pages/News_Scrapping.py

Browse files

Files changed (1) hide show

pages/News_Scrapping.py +14 -12

pages/News_Scrapping.py CHANGED Viewed

@@ -19,9 +19,6 @@ st.set_page_config(page_title='News Scrapping',  layout='wide', page_icon=':rock
 # Set up logging
 logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-# Initialize the DataFrame
-df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text', 'Translate', 'Status', 'Sentiment'])
 ## ............................................... ##
 # Function for translation
 def translate_text(text, source='auto', target='en'):
@@ -89,12 +86,10 @@ def process_article(url, config):
         keywords = article.keywords
         summary = article.summary
-        concated_text = title + '| ' + summary
         language = detect(concated_text)
-        tl = translate_text(concated_text, source=language, target='en')
-        status, predict = predict_sentiment(tl, model, tokenizer)
-        return publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict
     except Exception as e:
         logging.error(f"Article processing error: {str(e)}")
         return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
@@ -115,6 +110,15 @@ start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
 # Create a variable to track whether the data has been processed
 data_processed = False
 # Fetch news and process articles
 if st.button('Fetch and Process News'):
@@ -128,18 +132,16 @@ if st.button('Fetch and Process News'):
     news = google_news.get_news(search_term)
-    # Initialize your model and tokenizer
-    model, tokenizer = get_models_and_tokenizers()
     # Create a custom configuration to disable SSL certificate verification
     config = Config()
     config.ignore_ssl = True
     # Process articles
     for x in news:
-        publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
         temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
-                                'Summary': [summary], 'Text': [text], 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
         df = pd.concat([df, temp_df], ignore_index=True)

 # Set up logging
 logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 ## ............................................... ##
 # Function for translation
 def translate_text(text, source='auto', target='en'):
         keywords = article.keywords
         summary = article.summary
+        concated_text = title #+ '| ' + summary
         language = detect(concated_text)
+        return publish_date, language, url, title, authors, keywords, summary, text
     except Exception as e:
         logging.error(f"Article processing error: {str(e)}")
         return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
 # Create a variable to track whether the data has been processed
 data_processed = False
+# Initialize the DataFrame
+df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text']) #, 'Translate', 'Status', 'Sentiment'])
+# Initialize your model and tokenizer
+#model, tokenizer = get_models_and_tokenizers()
+#tl = translate_text(concated_text, source=language, target='en')
+#status, predict = predict_sentiment(tl, model, tokenizer)
 # Fetch news and process articles
 if st.button('Fetch and Process News'):
     news = google_news.get_news(search_term)
     # Create a custom configuration to disable SSL certificate verification
     config = Config()
     config.ignore_ssl = True
     # Process articles
     for x in news:
+        #publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
+        publish_date, language, url, title, authors, keywords, summary, text = process_article(x['url'], config)
         temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
+                                'Summary': [summary], 'Text': [text]}) #, 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
         df = pd.concat([df, temp_df], ignore_index=True)