File size: 3,553 Bytes
cb22296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re
import numpy as np
from src.arcs import generate_arc
import warnings
import pandas as pd
from configparser import ConfigParser, ExtendedInterpolation

# Silence every warning for the whole process: no category or module filter is
# given, so this also hides pandas warnings raised during preprocessing.
warnings.filterwarnings("ignore")

def get_last_known_bounty(row):
    """Extract the leading bounty figure from a raw bounty cell.

    Numeric cells (including NaN) are returned unchanged. For strings,
    bracketed citation markers such as ``[1]`` are blanked out, the first
    whitespace-separated token is taken, and its decimal digits are joined
    into an int, e.g. ``"1,500,000,000[3]"`` -> ``1500000000``.

    Returns:
        The number as found, ``np.nan`` when the first token of a string holds
        no decimal digits (or the string is empty), and ``None`` for any other
        input type.
    """
    if isinstance(row, bool):
        # bool is an int subclass but is never a meaningful bounty.
        return None
    if isinstance(row, (int, float, np.integer, np.floating)):
        return row
    if not isinstance(row, str):
        return None

    # split() without an argument ignores leading whitespace, so a cell that
    # starts with a space or a citation still yields its first real token.
    tokens = re.sub(r"\[.*?\]", " ", row).split()
    if not tokens:
        return np.nan

    # isdecimal() only accepts characters int() can parse; isdigit() would
    # also admit e.g. superscripts and make int() raise.
    digits = "".join(ch for ch in tokens[0] if ch.isdecimal())
    return int(digits) if digits else np.nan

def get_latest_age(row):
    """Return the largest age mentioned in a raw age cell.

    Bracketed citations (``[1]``) and parenthesised notes (``(debut)``) are
    removed, ``;`` is treated as a separator, and the maximum of the remaining
    purely numeric tokens is returned, e.g.
    ``"19 (debut)[1]; 21 (after timeskip)[2]"`` -> ``21``.

    Returns:
        An int for strings containing at least one numeric token, ``np.nan``
        for strings without one, the value itself for numeric cells
        (including NaN), and ``None`` for any other input type.
    """
    if isinstance(row, bool):
        # bool is an int subclass but is never a meaningful age.
        return None
    if isinstance(row, (int, float, np.integer, np.floating)):
        return row
    if not isinstance(row, str):
        return None

    text = re.sub(r"\[.*?\]", " ", row)
    text = re.sub(r"\(.*?\)", " ", text)
    # Replace with a space (not the empty string) so "19;21" stays two ages
    # instead of fusing into 1921.
    text = text.replace(";", " ")

    # isdecimal() only accepts tokens int() can parse.
    ages = [int(token) for token in text.split() if token.isdecimal()]
    return max(ages) if ages else np.nan

def get_main_crew(row):
    """Return the first ``;``-separated affiliation from a raw affiliation cell.

    Bracketed citations (``[1]``) and parenthesised notes (``(former)``) are
    removed first. Whitespace is then normalised (trimmed, inner runs collapsed
    to one space) so that the same crew always yields the same label whether
    or not a citation or note followed it in the raw text.

    Returns:
        The cleaned first affiliation for string input, ``None`` otherwise.
    """
    if not isinstance(row, str):
        return None

    text = re.sub(r"\[.*?\]", " ", row)
    text = re.sub(r"\(.*?\)", " ", text)
    first_entry = text.split(";")[0]
    # Removing citations/notes leaves stray padding behind; normalise it.
    return " ".join(first_entry.split())

class cleaner:
    """Post-process the scraped CSV files.

    Adds arc and appearance columns to the chapter-appearance CSV, adds
    cleaned bounty/age/crew columns to the character-details CSV, and writes a
    third CSV holding only the characters with both an age and a bounty.
    File locations and the last chapter come from the ``[SCRAPER]`` section of
    an INI file.
    """

    def __init__(self, config_path = 'cfg/cfg.ini'):
        """Read settings from *config_path* and build the arc lookup.

        Note: ``ConfigParser.read`` silently skips a missing file, so a wrong
        path surfaces as ``KeyError: 'SCRAPER'`` on the first lookup below.
        """
        pl_config = ConfigParser(interpolation=ExtendedInterpolation())
        pl_config.read(config_path)
        scraper_cfg = pl_config['SCRAPER']

        # NOTE(review): the +1 presumably makes end_chap an exclusive upper
        # bound for generate_arc -- confirm against src.arcs.
        self.end_chap = scraper_cfg.getint('end_chap') + 1
        self.char_link_fp = scraper_cfg.get('char_link_fp')
        self.chap_appearance_fp = scraper_cfg.get('chap_appearance_fp')
        self.char_details_fp = scraper_cfg.get('char_details_fp')
        self.age_bounty_fp = scraper_cfg.get('age_bounty_fp')
        self.arcs = generate_arc(self.end_chap)

    def arc_col(self,row):
        """Return the key of the first ``self.arcs`` entry containing ``row['Chapter']``.

        ``self.arcs`` is used as a mapping whose values support ``in``
        (presumably arc name -> chapters, as built by ``generate_arc``).
        Returns the string ``"None"`` when no entry contains the chapter.
        """
        for key in self.arcs:
            if row['Chapter'] in self.arcs[key]:
                return key
        return "None"

    def preprocess_data(self):
        """Derive the cleaned columns and write the three CSV files.

        The appearance and character-details CSVs are overwritten in place;
        the age/bounty subset is written to ``self.age_bounty_fp``.
        """
        appearance_df = pd.read_csv(self.chap_appearance_fp)

        # "Name (note)" -> name in column 0, note text in column 1.
        parts = appearance_df['Character'].str.split("(", expand=True)
        appearance_df['Appearance'] = parts[0]
        if 1 in parts.columns:
            # regex=False: a lone ")" is not a valid regular expression, so it
            # must be replaced as a literal character.
            notes = parts[1].str.replace(")", "", regex=False)
        else:
            # No row contained "(", so there are no notes at all.
            notes = np.nan
        appearance_df['Appearance Notes'] = notes
        appearance_df['Arc'] = appearance_df.apply(self.arc_col, axis=1)

        char_details_df = pd.read_csv(self.char_details_fp)
        char_details_df['last_bounty'] = char_details_df['bounty'].apply(get_last_known_bounty)
        # Coerce anything non-numeric (None, empty strings) to NaN so the
        # dropna/astype below cannot trip over it.
        char_details_df['latest_age'] = pd.to_numeric(
            char_details_df['age'].apply(get_latest_age), errors='coerce'
        )
        char_details_df['main_crew'] = char_details_df['affiliation'].apply(get_main_crew)

        # .copy() so the dtype change below acts on an independent frame
        # rather than on a slice of char_details_df.
        df_age_bounty = char_details_df.dropna(subset=['latest_age', 'last_bounty']).copy()
        df_age_bounty['latest_age'] = df_age_bounty['latest_age'].astype('int')

        appearance_df.to_csv(self.chap_appearance_fp, index = False)
        char_details_df.to_csv(self.char_details_fp, index = False)
        df_age_bounty.to_csv(self.age_bounty_fp, index = False)

if __name__ == '__main__':
    # Use a distinct variable name so the instance does not rebind the
    # `cleaner` class name at module level.
    data_cleaner = cleaner()
    data_cleaner.preprocess_data()