Spaces:
Running
Running
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import sqlite3 | |
from sqlite3 import Error | |
def create_connection(db_file):
    """Open the SQLite database at ``db_file`` and close it again right away.

    The connection is not handed back to the caller; the function returns
    ``None``.
    """
    connection = sqlite3.connect(db_file)
    if connection is not None:
        connection.close()
# Infobox ``data-source`` keys copied into the output, in addition to the name.
_CHAR_DETAIL_FIELDS = (
    'first', 'affiliation', 'occupation', 'residence', 'epithet',
    'status', 'age', 'bounty', 'dfname',
)


def _extract_char_details(infobox):
    """Return a dict with the name and the available detail fields of an infobox tag."""
    details = {"Name": infobox.find("h2", {"data-source": "name"}).text}
    for field in _CHAR_DETAIL_FIELDS:
        field_div = infobox.find("div", {"data-source": field})
        if field_div is None:
            # Field not present for this character: leave the key out.
            continue
        value_div = field_div.find("div", {"class": "pi-data-value pi-font"})
        if value_div is not None:
            details[field] = value_div.text
            continue
        # No plain value node: fall back to the titles of the linked entries.
        inner_div = field_div.find("div")
        if inner_div is not None:
            details[field] = [link.get("title") for link in inner_div.find_all("a")]
    return details


def scrape_char_details(char_link_df, save_file_name, timeout=30):
    """Scrape character infoboxes and write the collected details to a CSV file.

    For every entry in the ``'Link'`` column, the page at
    ``https://onepiece.fandom.com`` + link is downloaded, its
    ``<aside role="region">`` element is located, and the name plus the
    fields listed in ``_CHAR_DETAIL_FIELDS`` are collected. A link that
    cannot be processed is reported on stdout and skipped; the remaining
    links are still processed.

    Args:
        char_link_df: DataFrame with a ``'Link'`` column holding the URL
            paths appended to the site root.
        save_file_name: Destination passed to ``DataFrame.to_csv``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the whole run.

    Returns:
        None. The result is written to ``save_file_name`` without the index.
    """
    # Rows are gathered in a list and turned into a frame once at the end:
    # ``DataFrame.append`` no longer exists in pandas 2.x and was quadratic.
    records = []
    for char_link in char_link_df['Link'].tolist():
        try:
            url = f'https://onepiece.fandom.com{char_link}'
            page = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(page.content, 'html.parser')
            infobox = soup.find('aside', {'role': 'region'})
            records.append(_extract_char_details(infobox))
        except Exception:
            # Deliberately best-effort: a broken page must not abort the run.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            print(f'Unable to process: {char_link}')
            continue
    pd.DataFrame(records).to_csv(save_file_name, index=False)
if __name__ == '__main__':
    # SQLite setup is currently switched off; re-enable when needed:
    # create_connection(r"data/OPdash.db")
    scrape_char_details(
        pd.read_csv('data/char_link.csv'),
        save_file_name="data/char_details.csv",
    )