Update pages/News_Scrapping.py
Browse files- pages/News_Scrapping.py +14 -12
pages/News_Scrapping.py
CHANGED
@@ -19,9 +19,6 @@ st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':rock
|
|
19 |
# Set up logging
|
20 |
logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
21 |
|
22 |
-
# Initialize the DataFrame
|
23 |
-
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text', 'Translate', 'Status', 'Sentiment'])
|
24 |
-
|
25 |
## ............................................... ##
|
26 |
# Function for translation
|
27 |
def translate_text(text, source='auto', target='en'):
|
@@ -89,12 +86,10 @@ def process_article(url, config):
|
|
89 |
keywords = article.keywords
|
90 |
summary = article.summary
|
91 |
|
92 |
-
concated_text = title
|
93 |
language = detect(concated_text)
|
94 |
-
tl = translate_text(concated_text, source=language, target='en')
|
95 |
-
status, predict = predict_sentiment(tl, model, tokenizer)
|
96 |
|
97 |
-
return publish_date, language, url, title, authors, keywords, summary, text
|
98 |
except Exception as e:
|
99 |
logging.error(f"Article processing error: {str(e)}")
|
100 |
return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
|
@@ -115,6 +110,15 @@ start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
|
|
115 |
# Create a variable to track whether the data has been processed
|
116 |
data_processed = False
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
# Fetch news and process articles
|
119 |
if st.button('Fetch and Process News'):
|
120 |
|
@@ -128,18 +132,16 @@ if st.button('Fetch and Process News'):
|
|
128 |
|
129 |
news = google_news.get_news(search_term)
|
130 |
|
131 |
-
# Initialize your model and tokenizer
|
132 |
-
model, tokenizer = get_models_and_tokenizers()
|
133 |
-
|
134 |
# Create a custom configuration to disable SSL certificate verification
|
135 |
config = Config()
|
136 |
config.ignore_ssl = True
|
137 |
|
138 |
# Process articles
|
139 |
for x in news:
|
140 |
-
publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
|
|
|
141 |
temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
|
142 |
-
'Summary': [summary], 'Text': [text]
|
143 |
|
144 |
df = pd.concat([df, temp_df], ignore_index=True)
|
145 |
|
|
|
19 |
# Set up logging
|
20 |
logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
21 |
|
|
|
|
|
|
|
22 |
## ............................................... ##
|
23 |
# Function for translation
|
24 |
def translate_text(text, source='auto', target='en'):
|
|
|
86 |
keywords = article.keywords
|
87 |
summary = article.summary
|
88 |
|
89 |
+
concated_text = title #+ '| ' + summary
|
90 |
language = detect(concated_text)
|
|
|
|
|
91 |
|
92 |
+
return publish_date, language, url, title, authors, keywords, summary, text
|
93 |
except Exception as e:
|
94 |
logging.error(f"Article processing error: {str(e)}")
|
95 |
return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
|
|
|
110 |
# Create a variable to track whether the data has been processed
|
111 |
data_processed = False
|
112 |
|
113 |
+
# Initialize the DataFrame
|
114 |
+
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text']) #, 'Translate', 'Status', 'Sentiment'])
|
115 |
+
|
116 |
+
# Initialize your model and tokenizer
|
117 |
+
#model, tokenizer = get_models_and_tokenizers()
|
118 |
+
|
119 |
+
#tl = translate_text(concated_text, source=language, target='en')
|
120 |
+
#status, predict = predict_sentiment(tl, model, tokenizer)
|
121 |
+
|
122 |
# Fetch news and process articles
|
123 |
if st.button('Fetch and Process News'):
|
124 |
|
|
|
132 |
|
133 |
news = google_news.get_news(search_term)
|
134 |
|
|
|
|
|
|
|
135 |
# Create a custom configuration to disable SSL certificate verification
|
136 |
config = Config()
|
137 |
config.ignore_ssl = True
|
138 |
|
139 |
# Process articles
|
140 |
for x in news:
|
141 |
+
#publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
|
142 |
+
publish_date, language, url, title, authors, keywords, summary, text = process_article(x['url'], config)
|
143 |
temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
|
144 |
+
'Summary': [summary], 'Text': [text]}) #, 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
|
145 |
|
146 |
df = pd.concat([df, temp_df], ignore_index=True)
|
147 |
|