Spaces:
Sleeping
Sleeping
import re | |
import numpy as np | |
from src.arcs import generate_arc | |
import warnings | |
import pandas as pd | |
from configparser import ConfigParser, ExtendedInterpolation | |
warnings.filterwarnings("ignore") | |
def get_last_known_bounty(row):
    """Extract the first listed bounty from a raw bounty cell.

    Bracketed citation markers such as ``[1]`` are removed, the remaining
    text is split on whitespace, and the digits of the first token are joined
    into an integer, so ``"1,500,000[3] 300,000[2]"`` yields ``1500000``.
    NOTE(review): this presumes the source lists the most recent bounty
    first -- confirm against the scraped data.

    Args:
        row: Raw cell value from the ``bounty`` column.

    Returns:
        The float itself when ``row`` is a float (this includes NaN for
        missing cells), an ``int`` when the first token contains digits,
        ``np.nan`` when the string has no usable first token, and ``None``
        for any other input type.
    """
    if isinstance(row, float):
        return row
    if not isinstance(row, str):
        return None

    # Splitting on arbitrary whitespace (instead of a single literal space)
    # keeps a leading citation/space from producing an empty first token and
    # keeps newline/tab separated values from being glued together.
    tokens = re.sub(r"\[.*?\]", " ", row).split()
    if not tokens:
        return np.nan

    digits = "".join(ch for ch in tokens[0] if ch.isdigit())
    return int(digits) if digits else np.nan
def get_latest_age(row):
    """Return the largest age mentioned in a raw age cell.

    Bracketed citations (``[1]``) and parenthesised notes (``(debut)``) are
    stripped, the text is split on whitespace and semicolons, and the
    maximum of the purely numeric tokens is returned. For example
    ``"17 (debut); 19 (after timeskip)"`` yields ``19``.

    Args:
        row: Raw cell value from the ``age`` column.

    Returns:
        The maximum age as an ``int``; ``np.nan`` when the string contains no
        numeric token; ``None`` when ``row`` is not a string.
    """
    if not isinstance(row, str):
        return None

    text = re.sub(r"\[.*?\]", " ", row)
    text = re.sub(r"\(.*?\)", " ", text)

    ages = []
    # Semicolons become separators rather than being deleted; deleting them
    # would fuse neighbouring ages ("19;21" -> 1921).
    for token in text.replace(";", " ").split():
        if not token.isdigit():
            continue
        try:
            ages.append(int(token))
        except ValueError:
            # str.isdigit() accepts characters (e.g. superscripts) that
            # int() rejects; such tokens are not ages.
            continue

    # A missing-value marker (not an empty string) keeps the column numeric
    # and lets dropna() discard the row.
    return max(ages) if ages else np.nan
def get_main_crew(row):
    """Return the first ``;``-separated entry of a raw affiliation cell.

    Bracketed citations and parenthesised notes are each replaced by a
    single space before the text is cut at the first semicolon. The result
    is not stripped, so it may carry surrounding whitespace. Non-string
    input yields ``None``.
    """
    if type(row) is not str:
        return None

    without_citations = re.sub(r"\[.*?\]", " ", row)
    without_notes = re.sub(r"\(.*?\)", " ", without_citations)
    first_entry, _, _ = without_notes.partition(";")
    return first_entry
class cleaner:
    """Post-process the scraped CSV files and write the cleaned results.

    All settings are read from the ``[SCRAPER]`` section of an INI file:

    * ``end_chap`` -- stored as ``self.end_chap`` after adding 1.
    * ``char_link_fp``, ``chap_appearance_fp``, ``char_details_fp``,
      ``age_bounty_fp`` -- file paths stored under the same attribute names.

    ``self.arcs`` is whatever ``generate_arc(self.end_chap)`` returns; it is
    used as a mapping from arc name to a container of chapter values.
    """

    def __init__(self, config_path='cfg/cfg.ini'):
        """Load configuration and build the arc lookup.

        Args:
            config_path: Path to the INI configuration file.
        """
        pl_config = ConfigParser(interpolation=ExtendedInterpolation())
        pl_config.read(config_path)
        scraper_cfg = pl_config['SCRAPER']

        # NOTE(review): the +1 presumably turns the last chapter into an
        # exclusive upper bound for generate_arc -- confirm against src.arcs.
        self.end_chap = scraper_cfg.getint('end_chap') + 1
        self.char_link_fp = scraper_cfg.get('char_link_fp')
        self.chap_appearance_fp = scraper_cfg.get('chap_appearance_fp')
        self.char_details_fp = scraper_cfg.get('char_details_fp')
        self.age_bounty_fp = scraper_cfg.get('age_bounty_fp')
        self.arcs = generate_arc(self.end_chap)

    def arc_col(self, row):
        """Return the arc name whose chapter container holds ``row['Chapter']``.

        Arcs are checked in the iteration order of ``self.arcs``; the literal
        string ``"None"`` is returned when no arc contains the chapter.
        """
        chapter = row['Chapter']
        for key in self.arcs:
            if chapter in self.arcs[key]:
                return key
        return "None"

    @staticmethod
    def _split_character_notes(appearance_df):
        """Add ``Appearance`` and ``Appearance Notes`` columns in place.

        ``Character`` is split on ``"("``: the text before the first
        parenthesis becomes ``Appearance`` and the text between the first and
        second parenthesis, with ``")"`` removed, becomes
        ``Appearance Notes``. Rows without a parenthesis get a missing note.

        Returns:
            The same DataFrame object, for convenience.
        """
        parts = appearance_df['Character'].str.split("(", expand=True)
        appearance_df['Appearance'] = parts[0]
        if 1 in parts.columns:
            # ")" is a literal here; as a regular expression it is an
            # unbalanced parenthesis and fails to compile.
            notes = parts[1].str.replace(")", "", regex=False)
        else:
            # No row contained "(", so the split produced a single column.
            notes = np.nan
        appearance_df['Appearance Notes'] = notes
        return appearance_df

    def preprocess_data(self):
        """Clean the appearance and character-detail CSVs and write them back.

        * The appearance file gains ``Appearance``, ``Appearance Notes`` and
          ``Arc`` columns and is overwritten in place.
        * The character-details file gains ``last_bounty``, ``latest_age``
          and ``main_crew`` columns and is overwritten in place.
        * Rows that have both a latest age and a last bounty are written to
          ``self.age_bounty_fp`` with ``latest_age`` cast to int.
        """
        # Appearance data: split character names from their notes, tag arcs.
        appearance_df = pd.read_csv(self.chap_appearance_fp)
        appearance_df = self._split_character_notes(appearance_df)
        appearance_df['Arc'] = appearance_df.apply(self.arc_col, axis=1)

        # Character details: derive numeric bounty/age and the main crew.
        char_details_df = pd.read_csv(self.char_details_fp)
        char_details_df['last_bounty'] = char_details_df['bounty'].apply(get_last_known_bounty)
        char_details_df['latest_age'] = char_details_df['age'].apply(get_latest_age)
        # Normalise None results to NaN so dropna() below treats them alike.
        char_details_df['latest_age'] = char_details_df['latest_age'].fillna(value=np.nan)
        char_details_df['main_crew'] = char_details_df['affiliation'].apply(get_main_crew)

        # Explicit copy: the subset is modified below and must not alias
        # char_details_df (avoids chained-assignment ambiguity).
        df_age_bounty = char_details_df.dropna(subset=['latest_age', 'last_bounty']).copy()
        df_age_bounty['latest_age'] = df_age_bounty['latest_age'].astype('int')

        appearance_df.to_csv(self.chap_appearance_fp, index=False)
        char_details_df.to_csv(self.char_details_fp, index=False)
        df_age_bounty.to_csv(self.age_bounty_fp, index=False)
if __name__ == '__main__':
    # Bind the instance to its own name so the `cleaner` class is not
    # shadowed at module level after the script runs.
    data_cleaner = cleaner()
    data_cleaner.preprocess_data()