# Imports and Setup

In [None]:
import pandas as pd
pd.set_option('max_colwidth',150)
import numpy as np
import os
from datetime import datetime as dt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # allow multiple outputs in a cell
import warnings
warnings.filterwarnings("ignore")
import pickle

In [None]:
# Loading NLTK Modules
# Each nltk.download call fetches a resource into the local NLTK data directory
# (a no-op when the resource is already up to date).
import nltk
nltk.download('stopwords')                   # stop-word lists
nltk.download('punkt')                       # tokenizer models
nltk.download('wordnet')                     # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')                     # Open Multilingual Wordnet data
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger model
# NOTE(review): WordNetLemmatizer and stopwords are not used by the cells
# visible in this section; presumably they are used later -- confirm.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Loading all news articles raw data

This section loads the data used in the GLG project from the external data store. If you are running the notebook locally, do not run the download cell below; instead, define the path to your local copy of the data.

In [None]:
# Download the all-the-news-2-1 archive and unpack its CSV into a data folder.
# The body uses IPython shell escapes ("!"), so it only runs inside a
# notebook/IPython session where wget, unzip, mkdir, mv and rm are available.
# NOTE(review): the existence checks mix a relative file name and "./data"
# with absolute "/content/..." paths; they only agree when the working
# directory is /content -- confirm before running elsewhere.
# NOTE(review): the archive is deleted after unpacking, so re-running this
# function downloads it again even though the data folder already exists.
def download_dataset():
  
  # Skip the download when the archive is already in the working directory.
  if not os.path.isfile('all-the-news-2-1.zip?dl=0'):

    # Downloading Annotated Corpus for all-news-article dataset
    !wget https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip?dl=0

  # Unpack only when the data folder has not been created yet.
  if not os.path.exists("/content/data"):
    # Make a data folder to store the data
    !mkdir data

    # Extract the archive and move the CSV into the data folder.
    !unzip /content/all-the-news-2-1.zip?dl=0 
    !mv /content/all-the-news-2-1.csv ./data

    # Remove the archive once the CSV is in place.
    !rm /content/all-the-news-2-1.zip?dl=0
# Run the download/unpack step as soon as this cell executes.
download_dataset()

In [None]:
class newsArticleDataCleaner:
  """Load the all-the-news CSV and build a balanced technology/health sample.

  The class can be used online (in Colab) or offline (locally):

  1. Online: after the CSV has been downloaded to
     '/content/data/all-the-news-2-1.csv', create the object without
     arguments; that default location is used.
  2. Offline: pass ``data_path`` pointing at a local copy of the CSV.

  Parameters
  ----------
  data_path : str, optional
    Path to the 'all-the-news-2-1.csv' file. When None, the default
    Colab location '/content/data/all-the-news-2-1.csv' is used.

  Attributes
  ----------
  data_path : the value passed by the caller (possibly None).
  path : the CSV path that is actually read.
  data_tech_health : technology/health subset, set by
    ``extract_health_tech_data``.
  """

  def __init__(self, data_path=None):
    self.data_path = data_path
    self.path = self.get_file_path()

  def get_file_path(self):
    """Return the CSV path: ``data_path`` if given, else the Colab default."""
    if self.data_path is None:
      return '/content/data/all-the-news-2-1.csv'
    return self.data_path

  def filter_section(self, section):
    """Map a raw section value to 'technology', 'health' or 'other'.

    The value is converted with ``str`` and lower-cased first, so missing
    values (NaN/None) fall through to 'other'.
    """
    name = str(section).lower()
    if name.startswith('tech'):
      return 'technology'
    if name.startswith('health'):
      return 'health'
    return 'other'

  def extract_health_tech_data(self):
    """Read the CSV and keep only technology and health articles.

    1. Add a 'tech_health_tag' column derived from the 'section' column.
    2. Keep the rows tagged 'technology' or 'health'.

    The result is stored on ``self.data_tech_health`` and returned.
    """
    data = pd.read_csv(self.path, encoding="utf-8")

    # Tag every document by the section it belongs to.
    data['tech_health_tag'] = data['section'].apply(self.filter_section)

    # .copy() gives an independent frame so that columns can be added to it
    # later without chained-assignment warnings.
    wanted = data['tech_health_tag'].isin(['technology', 'health'])
    data_tech_health = data[wanted].copy()
    self.data_tech_health = data_tech_health

    return data_tech_health

  def clean(self, *, sample_size=5500, random_state=1, max_null_fraction=0.05,
            min_words=50, max_words=1340):
    """Return a cleaned, class-balanced sample of tech and health articles.

    Steps:
    1. Call ``extract_health_tech_data``.
    2. Add 'article_word_len' and keep articles whose word count lies in
       [min_words, max_words].
    3. Drop rows whose 'article' is null.
    4. Drop columns whose null count exceeds ``max_null_fraction`` of the
       rows, then drop any row that still contains a null.
    5. Parse 'date' as datetime and reset the index.
    6. Lower-case all column names.
    7. Sample ``sample_size`` rows per class and concatenate them
       (technology first, then health).

    No text normalisation (lemmatization, punctuation or stop-word removal)
    is performed here.

    Parameters
    ----------
    sample_size : int
      Rows drawn from each class (default 5500).
    random_state : int
      Seed passed to ``DataFrame.sample`` (default 1).
    max_null_fraction : float
      Columns with more than this fraction of nulls are dropped. The
      default 0.05 matches the original ``len(data)/20`` threshold (5%).
    min_words, max_words : int
      Inclusive word-count bounds. The original comment attributes the
      1340 upper bound to data exploration (95% of articles are shorter).

    Raises
    ------
    ValueError
      If either class has fewer than ``sample_size`` rows after cleaning.
    KeyError
      If 'date' is missing, e.g. because it was dropped as too sparse.
    """
    data = self.extract_health_tech_data()
    data['article_word_len'] = data['article'].apply(lambda x: len(str(x).split()))

    # Series.between is inclusive on both ends, like the >= / <= pair it replaces.
    data = data[data['article_word_len'].between(min_words, max_words)]
    data = data.dropna(subset=['article'])

    # Drop sparse columns first so the row-wise dropna below does not
    # discard rows only because of a mostly-empty column.
    null_counts = data.isnull().sum()
    sparse_cols = null_counts[null_counts > len(data) * max_null_fraction].index
    data = data.drop(columns=sparse_cols)
    data = data.dropna()

    data['date'] = pd.to_datetime(data['date'])
    data = data.reset_index(drop=True)
    data.columns = data.columns.str.lower()

    samples = []
    for tag in ('technology', 'health'):
      subset = data[data['tech_health_tag'] == tag]
      if len(subset) < sample_size:
        raise ValueError(
            f"Cannot sample {sample_size} '{tag}' articles: "
            f"only {len(subset)} remain after cleaning")
      samples.append(subset.sample(n=sample_size, random_state=random_state))

    return pd.concat(samples, ignore_index=True)

# News Article Data Cleaner Class

1. Create a newsArticleDataCleaner class object

- Note: when creating the class object, specify the path where the raw `all-the-news-2-1.csv` data was downloaded and stored. If you are working in Google Colab, you can use `path_file = '/content/data/all-the-news-2-1.csv'`.




In [None]:
# Creating class object
# path_file is the CSV the cleaner will read; this value is the same path the
# class falls back to when no data_path is passed.
path_file = '/content/data/all-the-news-2-1.csv'
article_obj = newsArticleDataCleaner(path_file)

In [None]:
# Call the clean method of the class.
# It returns a processed dataframe: a sample of 5500 technology and 5500
# health articles (technology rows first), with lower-cased column names.
df = article_obj.clean()

In [None]:
# Specify the location according to your working environment to save the dataframe.
# NOTE(review): this path looks like a mounted Google Drive folder -- confirm
# it exists before running.
df.to_csv('/content/drive/MyDrive/data_tech_health.csv', sep=',', index=False)

In [None]:
# Preview the first rows of the cleaned dataframe.
df.head()

In [None]:
# Summary statistics of the article word counts, reporting the 10th, 25th,
# 50th, 75th and 95th percentiles.
df['article_word_len'].describe([0.1,0.25,0.5,0.75,0.95])

count    11000.000000
mean       442.630636
std        312.645494
min         50.000000
10%         81.000000
25%        190.000000
50%        385.000000
75%        623.000000
95%       1092.000000
max       1340.000000
Name: article_word_len, dtype: float64