import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from sqlite3 import Error


def create_connection(db_file):
    """ create a database connection to a SQLite database """
    conn = None
    try:
        # Connecting creates the database file if it does not already exist
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()
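# Usage sketch, mirroring the commented-out lines under __main__ below
# (the path is this project's own example, not a required location):
#   create_connection(r"data/OPdash.db")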

def scrape_char_details(char_link_df, save_file_name):
    """Scrape each character's infobox from the wiki and save the details to a CSV."""
    char_links = char_link_df['Link'].tolist()
    records = []
    for char_link in char_links:
        try:
            url = f'https://onepiece.fandom.com{char_link}'
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            # The character infobox is rendered as an <aside role="region">
            table = soup.find('aside', {'role': 'region'})

            name = table.find("h2", {"data-source": "name"}).text
            char_det_dict = {"Name": name}
            det_list = ['first', 'affiliation', 'occupation', 'residence',
                        'epithet', 'status', 'age', 'bounty', 'dfname']
            for det in det_list:
                det_div = table.find("div", {"data-source": det})
                if det_div is None:
                    continue
                value_div = det_div.find("div", {"class": "pi-data-value pi-font"})
                if value_div is None:
                    continue
                # .text is always a string, never None, so test for emptiness instead
                text_value = value_div.text.strip()
                if text_value:
                    char_det_dict[det] = text_value
                else:
                    # Fall back to the title attributes of any links in the field
                    char_det_dict[det] = [a.get("title") for a in value_div.find_all("a")]
            records.append(char_det_dict)
        except Exception as e:
            print(f'Unable to process: {char_link} ({e})')
            continue
    # Build the DataFrame in one pass; DataFrame.append was removed in pandas 2.0
    df = pd.DataFrame(records)
    df.to_csv(save_file_name, index=False)
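# Expected input: data/char_link.csv must contain a 'Link' column of wiki paths
# that are appended to https://onepiece.fandom.com, e.g. '/wiki/Monkey_D._Luffy'
# (a hypothetical example path, not taken from the actual CSV).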
        
if __name__ == '__main__':
    # dbname = r"data/OPdash.db"
    # create_connection(dbname)
    char_link_df = pd.read_csv('data/char_link.csv')
    scrape_char_details(char_link_df, save_file_name="data/char_details.csv")