Spaces:
Sleeping
Sleeping
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
def scrape_char_links(char_dict, start_chap=1, end_chap=5000, continue_last=True):
    """Collect character wiki links from One Piece fandom chapter pages.

    For every chapter number in ``range(start_chap, end_chap)`` the page
    ``https://onepiece.fandom.com/wiki/Chapter_<n>`` is downloaded, its
    ``<table class="CharTable">`` is located, and the first ``<a>`` inside
    each ``<li>`` of that table is recorded as ``title -> href``.

    Args:
        char_dict: Mapping of link title to link href. It is updated in
            place; titles already present keep their existing value.
        start_chap: First chapter number to fetch (inclusive).
        end_chap: Chapter number to stop at (exclusive).
        continue_last: Accepted for backward compatibility; it currently
            has no effect on the scrape.

    Returns:
        The same ``char_dict`` object that was passed in.

    Raises:
        requests.RequestException: If a page cannot be fetched (including
            when the request times out).
    """
    for chapter in range(start_chap, end_chap):
        # Coarse progress indicator for long runs.
        if chapter % 100 == 0:
            print(chapter)

        url = f'https://onepiece.fandom.com/wiki/Chapter_{chapter}'
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        table = soup.find('table', class_='CharTable')
        if table is None:
            # No character table on this page (e.g. the chapter page does
            # not exist); skip it instead of failing on ``None.find_all``.
            continue

        for item in table.find_all('li'):
            anchor = item.find('a')
            if anchor is None:
                # List entry without a link: nothing to record.
                continue
            title = anchor.get('title')
            if title is None:
                # Avoid storing an entry under a ``None`` key.
                continue
            # Keep the first href seen for each title.
            char_dict.setdefault(title, anchor.get('href'))

    return char_dict