Bayhaqy committed
Commit 662dfae
1 Parent(s): d212f62

Update pages/News_Scrapping.py

Files changed (1)
  1. pages/News_Scrapping.py +203 -99
pages/News_Scrapping.py CHANGED
@@ -1,3 +1,5 @@
+from streamlit_pandas_profiling import st_profile_report
+from ydata_profiling import ProfileReport
 import streamlit as st
 import pandas as pd
 from newspaper import Article, Config
@@ -8,6 +10,9 @@ import torch
 import requests
 import logging
 from gnews import GNews
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
+from urllib.parse import urlparse
 import nltk
 nltk.download('punkt')
 
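Note on the two new lexicon-based dependencies imported above: both can be sanity-checked in isolation before running the Space. A minimal sketch (the sample sentence is arbitrary; only the vaderSentiment and textblob packages are assumed):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

text = "The market reacted positively to the announcement."

# VADER returns a dict of scores; analyze_sentiment_vader below keys off
# 'compound' with cutoffs at +0.05 and -0.05.
print(SentimentIntensityAnalyzer().polarity_scores(text)['compound'])

# TextBlob exposes a polarity in [-1, 1]; analyze_sentiment_textblob maps
# its sign to Positive / Negative / Neutral.
print(TextBlob(text).sentiment.polarity)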
@@ -15,153 +20,252 @@ nltk.download('punkt')
 # Set page configuration (Call this once and make changes as needed)
 st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':rocket:')
 
+with st.container():
+    # Initialize Streamlit app
+    st.title('News Article Scrapping')
+    st.write("Created by Bayhaqy")
+
 ## ............................................... ##
 # Set up logging
 logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 ## ............................................... ##
-# Function for translation
-def translate_text(text, source='auto', target='en'):
-    try:
-        if source != target:
-            text = GoogleTranslator(source=source, target=target).translate(text)
-        return text
-    except Exception as e:
-        logging.error(f"Translation error: {str(e)}")
-        return text
+# Function for get model and tokenize
+@st.cache_resource
+def get_models_and_tokenizers():
+    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+    #model.eval()
+
+    return model, tokenizer
 
 # Function for sentiment analysis
-def predict_sentiment(text, model, tokenizer):
+@st.cache_resource
+def analyze_sentiment_distilbert(text, _model, _tokenizer):
     try:
-        tokens_info = tokenizer(text, truncation=True, return_tensors="pt")
+        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
         with torch.no_grad():
-            raw_predictions = model(**tokens_info).logits
+            raw_predictions = _model(**tokens_info).logits
 
         predicted_class_id = raw_predictions.argmax().item()
-        predict = model.config.id2label[predicted_class_id]
+        predict = _model.config.id2label[predicted_class_id]
 
         softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
-
         if (softmaxed > 70):
             status = 'Not trust'
         elif (softmaxed > 40):
             status = 'Not sure'
         else:
             status = 'Trust'
-
         return status, predict
+
     except Exception as e:
         logging.error(f"Sentiment analysis error: {str(e)}")
         return 'N/A', 'N/A'
 
-def get_models_and_tokenizers():
-    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
-    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
-    model.eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    return model, tokenizer
+# Function for sentiment analysis using VADER
+@st.cache_data
+def analyze_sentiment_vader(text):
+    analyzer = SentimentIntensityAnalyzer()
+    sentiment = analyzer.polarity_scores(text)
+    compound_score = sentiment['compound']
+    if compound_score >= 0.05:
+        return 'Positive'
+    elif compound_score <= -0.05:
+        return 'Negative'
+    else:
+        return 'Neutral'
+
+# Function for sentiment analysis using TextBlob
+@st.cache_data
+def analyze_sentiment_textblob(text):
+    analysis = TextBlob(text)
+    polarity = analysis.sentiment.polarity
+    if polarity > 0:
+        return 'Positive'
+    elif polarity < 0:
+        return 'Negative'
+    else:
+        return 'Neutral'
 
+## ............................................... ##
 # Function to process an article
-def process_article(url, config):
+@st.cache_data
+def process_article(url, _config):
     try:
-        article = Article(url=url, config=config)
+        article = Article(url=url, config=_config)
         article.download()
         article.parse()
 
-        # Get the article data
-        title = article.title
-        authors = article.authors
-
-        publish_date = article.publish_date
+        # Check if publish_date is not None before further processing
+        if article.publish_date is None:
+            return None # Skip processing and return None
 
-        # Check if publish_date is not None before accessing 'value'
-        if publish_date is not None:
-            publish_date = publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
-        else:
-            publish_date = 'N/A'
+        # Check if text is not None before further processing
+        if len(article.text) <= 5:
+            return None # Skip processing and return None
 
+        # Get the article data if publish_date is not None
         text = article.text
+        url = article.canonical_link
+        source_url = urlparse(url).netloc
+
+        title = article.title
+        authors = article.authors
+        #publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
+        publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')
+
         article.nlp()
-        keywords = article.keywords
+        keywords = article.meta_keywords
         summary = article.summary
 
-        concated_text = title #+ '| ' + summary
-        language = detect(concated_text)
+        language = detect(title)
+
+        return publish_date, language, url, source_url, title, authors, keywords, text, summary
 
-        return publish_date, language, url, title, authors, keywords, summary, text
     except Exception as e:
         logging.error(f"Article processing error: {str(e)}")
-        return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
+        return None # Skip processing and return None
+
+# Function for translation
+@st.cache_data
+def translate_text(text, source='auto', target='en'):
+    try:
+        if source != target:
+            text = GoogleTranslator(source=source, target=target).translate(text)
+        return text
+
+    except Exception as e:
+        logging.error(f"Translation error: {str(e)}")
+        return text
 
 ## ............................................... ##
-# Initialize Streamlit app
-st.title('News Article Scrapping')
-st.write("Created by Bayhaqy")
+with st.container():
+    # Input search parameters
+    search_term = st.text_input('Enter a search term :', 'Indonesia')
+
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        period = st.text_input('Enter a news period :', '7d')
+        max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
+    with col2:
+        country = st.text_input('Country :', 'Indonesia')
+        language = st.text_input('Language :', 'indonesian')
+    with col3:
+        start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
+        end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))
 
 ## ............................................... ##
-# Input search parameters
-search_term = st.text_input('Enter a search term:', 'palestina')
-max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
-country = st.text_input('Country:', 'Indonesia')
-language = st.text_input('Language:', 'indonesian')
-start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
+with st.container():
+    col1, col2 = st.columns(2)
+
+    with col1:
+        # Checkbox options for different processing steps
+        include_translation = st.checkbox("Include Translation", value=False)
+        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
+    with col2:
+        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
+        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)
+
+## ............................................... ##
 
 # Create a variable to track whether the data has been processed
 data_processed = False
 
+## ............................................... ##
+# Create a custom configuration
+config = Config()
+config.number_threads = 200
+config.request_timeout = 3
+
+## ............................................... ##
 # Initialize the DataFrame
-df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text']) #, 'Translate', 'Status', 'Sentiment'])
+df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title', 'Authors', 'Keywords', 'Text', 'Summary'])
 
 # Initialize your model and tokenizer
-#model, tokenizer = get_models_and_tokenizers()
-
-#tl = translate_text(concated_text, source=language, target='en')
-#status, predict = predict_sentiment(tl, model, tokenizer)
-
-# Fetch news and process articles
-if st.button('Fetch and Process News'):
-
-    # Your news retrieval code
-    google_news = GNews()
-
-    google_news.max_results = max_results
-    google_news.country = country
-    google_news.language = language
-    google_news.start_date = (start_date.year, start_date.month, start_date.day)
-
-    news = google_news.get_news(search_term)
+model, tokenizer = get_models_and_tokenizers()
 
-    # Create a custom configuration to disable SSL certificate verification
-    config = Config()
-    config.ignore_ssl = True
-
-    # Process articles
-    for x in news:
-        #publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
-        publish_date, language, url, title, authors, keywords, summary, text = process_article(x['url'], config)
-        temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
-                                'Summary': [summary], 'Text': [text]}) #, 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
-
-        df = pd.concat([df, temp_df], ignore_index=True)
-
-    # Set data_processed to True when the data has been successfully processed
-    data_processed = True
-
-
-# Add a button to download the data as a CSV file
-if data_processed:
-    st.markdown("### Download Processed Data as CSV")
-    st.write("Click the button below to download the processed data as a CSV file.")
-
-    # Create a downloadable link
-    csv_data = df.to_csv(index=False).encode()
-    st.download_button(
-        label="Download CSV",
-        data=csv_data,
-        file_name="processed_data.csv",
-    )
-
-# Display processed data
-if data_processed:
-    st.write(df.head())
+## ............................................... ##
+with st.container():
+    # Fetch news and process articles
+    if st.button('Fetch and Process News'):
+
+        # Your news retrieval code
+        google_news = GNews()
+
+        google_news.period = period # News from last 7 days
+        google_news.max_results = max_results # number of responses across a keyword
+        google_news.country = country # News from a specific country
+        google_news.language = language # News in a specific language
+        #google_news.exclude_websites = ['yahoo.com', 'cnn.com'] # Exclude news from specific website i.e Yahoo.com and CNN.com
+        google_news.start_date = (start_date.year, start_date.month, start_date.day) # Search from 1st Jan 2023
+        google_news.end_date = (end_date.year, end_date.month, end_date.day) # Search until 1st Dec 2023
+
+        news = google_news.get_news(search_term)
+
+        ## ............................................... ##
+        # Progress bar for fetching and processing news
+        progress_bar = st.progress(0)
+        total_news = len(news)
+
+        # Your news retrieval code (assuming 'news' is a list of article URLs)
+        #for x in news:
+        for idx, x in enumerate(news):
+            result = process_article(x['url'], _config=config)
+            if result is not None:
+                publish_date, language, url, source_url, title, authors, keywords, text, summary = result
+                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
+                                        'Text': [text], 'Summary': [summary]})
+                df = pd.concat([df, temp_df], ignore_index=True)
+
+            # Update the progress bar
+            progress = (idx + 1) / total_news
+            progress_bar.progress(progress)
+
+        # Conditionally apply translation function to the 'Translation' column
+        if include_translation:
+            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
+
+        # Conditionally apply sentiment analysis function to the 'Translation' column
+        if include_sentiment_analysis:
+            df[['Fake_Check', 'Sentiment_Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))
+
+        # Conditionally apply VADER sentiment analysis to the 'Translation' column
+        if include_sentiment_vader:
+            df['Sentiment_VADER'] = df['Translation'].apply(analyze_sentiment_vader)
+
+        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+        if include_sentiment_textblob:
+            df['Sentiment_TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
+
+        # Set data_processed to True when the data has been successfully processed
+        data_processed = True
+
+    ## ............................................... ##
+    # Add a button to download the data as a CSV file
+    if data_processed:
+        st.markdown("### Download Processed Data as CSV")
+        st.write("Click the button below to download the processed data as a CSV file.")
+
+        # Create a downloadable link
+        csv_data = df.to_csv(index=False).encode()
+        st.download_button(
+            label="Download CSV",
+            data=csv_data,
+            file_name="processed_data.csv",
+        )
+
+    with st.expander("See Table"):
+        ## ............................................... ##
+        # Display processed data
+        if data_processed:
+            st.dataframe(df)
+
+    with st.expander("See EDA"):
+        ## ............................................... ##
+        # Display processed data
+        if data_processed:
+            pr = ProfileReport(df)
+            st_profile_report(pr)
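
For reference, the fetch-and-parse core of the updated page can be exercised headlessly. A minimal sketch, assuming the same gnews and newspaper3k packages the Space uses; the '7d' period, 3-second timeout, thread count, and the x['url'] key mirror what the diff above sets, while the search term and max_results here are arbitrary:

from gnews import GNews
from newspaper import Article, Config

config = Config()
config.request_timeout = 3   # same timeout the commit configures
config.number_threads = 200  # matches the new custom configuration

google_news = GNews()
google_news.period = '7d'
google_news.max_results = 3

for x in google_news.get_news('Indonesia'):
    article = Article(url=x['url'], config=config)
    article.download()
    article.parse()
    # Skip undated or near-empty pages, as the reworked process_article does
    if article.publish_date is None or len(article.text) <= 5:
        continue
    print(article.publish_date.strftime('%Y-%m-%d %H:%M'), '|', article.title)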