Spaces:
Sleeping
Sleeping
File size: 3,553 Bytes
cb22296 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import re
import numpy as np
from src.arcs import generate_arc
import warnings
import pandas as pd
from configparser import ConfigParser, ExtendedInterpolation
warnings.filterwarnings("ignore")
def get_last_known_bounty(row):
    """Extract the first bounty figure from a raw ``bounty`` cell.

    Bracketed citation markers such as ``[1]`` are removed, then the decimal
    digits of the first whitespace-separated token are joined into a single
    integer, which drops thousands separators like ``,``.
    NOTE(review): this presumably relies on the most recent bounty being
    listed first in the cell - confirm against the scraped data.

    Args:
        row: One cell value; a string, or a float (e.g. NaN for a missing cell).

    Returns:
        The float unchanged when ``row`` is a float; the parsed ``int`` when
        the first token contains digits; ``np.nan`` when it does not; ``None``
        for any other input type.
    """
    if isinstance(row, float):
        return row
    if not isinstance(row, str):
        return None

    # Splitting on any whitespace skips the padding left behind by a leading
    # space or a leading citation marker, so the first real token is used.
    tokens = re.sub(r"\[.*?\]", " ", row).split()
    if not tokens:
        return np.nan

    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also admits e.g. superscripts, which would make int() raise.
    digits = "".join(ch for ch in tokens[0] if ch.isdecimal())
    return int(digits) if digits else np.nan
def get_latest_age(row):
    """Return the largest age mentioned in a raw ``age`` cell.

    Bracketed citations (``[...]``) and parenthesised notes (``(...)``) are
    removed, semicolons are dropped, and every space-separated token made up
    solely of decimal digits is read as an age.

    Args:
        row: One cell value from the ``age`` column.

    Returns:
        The maximum age as an ``int``; ``np.nan`` when the text contains no
        purely numeric token; ``None`` when ``row`` is not a string.
    """
    if not isinstance(row, str):
        return None

    text = re.sub(r"\[.*?\]", " ", row)
    text = re.sub(r"\(.*?\)", " ", text)
    text = text.replace(";", "")

    # isdecimal() keeps only tokens int() can parse, so no try/except is
    # needed and the list never mixes ints with strings.
    ages = [int(token) for token in text.split(" ") if token.isdecimal()]

    # Report "no age found" as a missing value rather than an empty string,
    # so dropna() and integer casts downstream treat it as missing.
    return max(ages) if ages else np.nan
def get_main_crew(row):
    """Return the first affiliation listed in a raw ``affiliation`` cell.

    Bracketed citations (``[...]``) and parenthesised notes (``(...)``) are
    removed, the text is split on ``;`` and the first entry is returned with
    surrounding whitespace stripped and internal runs of whitespace collapsed.

    Args:
        row: One cell value from the ``affiliation`` column.

    Returns:
        The cleaned first entry as a ``str`` (possibly empty), or ``None``
        when ``row`` is not a string.
    """
    if not isinstance(row, str):
        return None

    text = re.sub(r"\[.*?\]", " ", row)
    text = re.sub(r"\(.*?\)", " ", text)
    first_entry = text.split(";")[0]

    # The substitutions above insert padding spaces; normalise them so the
    # same crew is not emitted as both "X" and "X ".
    return " ".join(first_entry.split())
class cleaner:
    """Add derived columns to the CSV files named in an INI configuration.

    ``preprocess_data`` reads the chapter-appearance and character-details
    CSVs, adds derived columns, writes both files back in place, and writes
    the rows that have both an age and a bounty to a third CSV.
    """

    def __init__(self, config_path='cfg/cfg.ini'):
        """Load file paths and the chapter limit from the ``SCRAPER`` section.

        Args:
            config_path: Path to the INI file to read.
        """
        pl_config = ConfigParser(interpolation=ExtendedInterpolation())
        pl_config.read(config_path)
        scraper_cfg = pl_config['SCRAPER']
        # NOTE(review): the +1 presumably turns the configured last chapter
        # into an exclusive upper bound for generate_arc - confirm there.
        self.end_chap = scraper_cfg.getint('end_chap') + 1
        self.char_link_fp = scraper_cfg.get('char_link_fp')
        self.chap_appearance_fp = scraper_cfg.get('chap_appearance_fp')
        self.char_details_fp = scraper_cfg.get('char_details_fp')
        self.age_bounty_fp = scraper_cfg.get('age_bounty_fp')
        self.arcs = generate_arc(self.end_chap)

    def arc_col(self, row):
        """Return the arc whose chapter collection contains ``row['Chapter']``.

        Args:
            row: A mapping-like row providing a ``'Chapter'`` entry.

        Returns:
            The first key of ``self.arcs`` whose value contains the chapter,
            or the string ``"None"`` when no arc matches.
        """
        for key in self.arcs:
            if row['Chapter'] in self.arcs[key]:
                return key
        return "None"

    @staticmethod
    def _split_character(characters):
        """Split ``"Name (notes)"`` strings into name and notes Series.

        Args:
            characters: A string Series of raw character entries.

        Returns:
            A ``(appearance, notes)`` tuple of Series. ``appearance`` is the
            text before the first ``(``; ``notes`` is the text between the
            first and second ``(`` with every ``)`` removed, and is missing
            for entries without ``(``.
        """
        parts = characters.str.split("(", expand=True)
        if 1 in parts.columns:
            # ")" on its own is not a valid regular expression, so it has to
            # be replaced as a literal string.
            notes = parts[1].str.replace(")", "", regex=False)
        else:
            # No entry contained "(", so the split produced no notes column.
            notes = pd.Series(np.nan, index=characters.index, dtype=object)
        return parts[0], notes

    def preprocess_data(self):
        """Derive the extra columns and write the three CSV files.

        The appearance and character-details files are overwritten in place;
        rows with both a numeric age and a bounty go to ``age_bounty_fp``.
        """
        # Appearance table: split the raw character text and tag the arc.
        appearance_df = pd.read_csv(self.chap_appearance_fp)
        appearance, notes = self._split_character(appearance_df['Character'])
        appearance_df['Appearance'] = appearance
        appearance_df['Appearance Notes'] = notes
        appearance_df['Arc'] = appearance_df.apply(self.arc_col, axis=1)

        # Character details: parse bounty, age and main crew.
        char_details_df = pd.read_csv(self.char_details_fp)
        char_details_df['last_bounty'] = char_details_df['bounty'].apply(get_last_known_bounty)
        char_details_df['latest_age'] = char_details_df['age'].apply(get_latest_age)
        char_details_df['latest_age'] = char_details_df['latest_age'].fillna(value=np.nan)
        char_details_df['main_crew'] = char_details_df['affiliation'].apply(get_main_crew)

        # Keep only rows with a usable age and a bounty. Coercing first means
        # a non-numeric age is treated as missing instead of breaking the
        # integer cast, and .copy() avoids assigning into a slice of
        # char_details_df.
        numeric_age = pd.to_numeric(char_details_df['latest_age'], errors='coerce')
        has_both = numeric_age.notna() & char_details_df['last_bounty'].notna()
        df_age_bounty = char_details_df.loc[has_both].copy()
        df_age_bounty['latest_age'] = numeric_age[has_both].astype('int')

        appearance_df.to_csv(self.chap_appearance_fp, index=False)
        char_details_df.to_csv(self.char_details_fp, index=False)
        df_age_bounty.to_csv(self.age_bounty_fp, index=False)
if __name__ == '__main__':
    # Run the preprocessing with the default configuration path. The instance
    # gets its own name so the `cleaner` class is not rebound to an object.
    data_cleaner = cleaner()
    data_cleaner.preprocess_data()