import re import numpy as np from src.arcs import generate_arc import warnings import pandas as pd from configparser import ConfigParser, ExtendedInterpolation warnings.filterwarnings("ignore") def get_last_known_bounty(row): """get latest bounty for each character row """ if type(row) == float: return row elif type(row) == str: x = re.sub(r"\[.*?\]", " ", row) x = x.split(" ") ret = ''.join([n for n in x[0] if n.isdigit()]) if len(ret) ==0: return np.nan return int(ret) def get_latest_age(row): if type(row) == str: x = re.sub(r"\[.*?\]", " ", row) x = re.sub(r"\(.*?\)", " ", x) x = x.replace(";", "") x = x.split(" ") ret = ' '.join([n for n in x if n.isdigit()]) ret = ret.split(" ") newret = [] for i in ret: try: newret.append(int(i)) except: newret.append(i) return (max(newret)) def get_main_crew(row): if type(row) == str: x = re.sub(r"\[.*?\]", " ", row) x = re.sub(r"\(.*?\)", " ", x) x = x.split(";") # x = x.split("") return x[0] class cleaner: def __init__(self, config_path = 'cfg/cfg.ini'): pl_config = ConfigParser(interpolation=ExtendedInterpolation()) pl_config.read(config_path) self.end_chap = pl_config['SCRAPER'].getint('end_chap') + 1 self.char_link_fp = pl_config['SCRAPER'].get('char_link_fp') self.chap_appearance_fp = pl_config['SCRAPER'].get('chap_appearance_fp') self.char_details_fp = pl_config['SCRAPER'].get('char_details_fp') self.age_bounty_fp = pl_config['SCRAPER'].get('age_bounty_fp') self.arcs = generate_arc(self.end_chap) def arc_col(self,row): """function to generate arc per row for appearance df """ for key in self.arcs: if row['Chapter'] in self.arcs[key]: return key return "None" def preprocess_data(self): # preprocess to add arc appearance_df = pd.read_csv(self.chap_appearance_fp) # appearance_df['Chapter'] = appearance_df['Chapter'].ffill() # df['Arc Name'] = df['Arc Name'].ffill() appearance_df['Appearance'] = appearance_df['Character'].str.split("(",expand=True)[0] appearance_df['Appearance Notes'] = appearance_df['Character'].str.split("(",expand=True)[1] appearance_df['Appearance Notes'] = appearance_df['Appearance Notes'].str.replace(")", "", regex = True) appearance_df['Arc'] = appearance_df.apply(self.arc_col, axis =1) char_details_df = pd.read_csv(self.char_details_fp) char_details_df['last_bounty'] = char_details_df['bounty'].apply(get_last_known_bounty) char_details_df['latest_age'] = char_details_df['age'].apply(get_latest_age) char_details_df['latest_age']= char_details_df['latest_age'].fillna(value=np.nan) char_details_df['main_crew'] = char_details_df['affiliation'].apply(get_main_crew) df_age_bounty = char_details_df.dropna(subset=['latest_age', 'last_bounty']) df_age_bounty['latest_age'] = df_age_bounty['latest_age'].astype('int') appearance_df.to_csv(self.chap_appearance_fp, index = False) char_details_df.to_csv(self.char_details_fp, index = False) df_age_bounty.to_csv(self.age_bounty_fp, index = False) if __name__ == '__main__': cleaner = cleaner() cleaner.preprocess_data()