File size: 2,542 Bytes
6cd90ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import urllib.request
import json
import urllib.parse
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
# url = 'https://pokemon.fandom.com/ko/wiki/흥나숭_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/나몰빼미_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/도치마론_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/비크티니_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/모부기_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/나무지기_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/치코리타_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/토게틱_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/포푸니_(포켓몬)'
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/레트라_(포켓몬)'
# url = 'https://pokemon.fandom.com/ko/wiki/신비록_(포켓몬)'
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'
info = []
erros = []
target_number = 1017
cnt = 0
for _ in tqdm(range(target_number+2)):
cnt += 1
req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
res = urlopen(req)
html = res.read()
soup = BeautifulSoup(html, 'html.parser')
name = soup.find("div", {"class": "name-ko"}).text.strip()
number = soup.find("div", {"class": "index"}).text.strip()
try:
img_url = soup.find("div", {"class":"image rounded"}).find("img")['data-src']
filepath = f"images/{number.replace('.', '_')}_{name}.png"
urllib.request.urlretrieve(img_url, filepath)
except:
filepath = None
doc_text = '\n'.join([p.text.replace('\n', '').strip() for p in soup.find_all("p")])
types = [poke_type['title'].split(' ')[0].strip() for poke_type in soup.select('tbody > tr > td > div')[0].select('span > a')]
info.append(dict(
name=name,
number=number,
types=types,
doc_text=doc_text,
image_path=filepath,
url=encoded_url
))
next_monster = soup.find("table").findAll("a")[-1]['href']
encoded_url = "https://pokemon.fandom.com" + next_monster
if number == f"No.{target_number:04d}":
break
if cnt >= target_number:
break
pd.DataFrame(info).to_csv('pokemon.csv', index=False)
with open('pokemon.json', 'w') as f:
json.dump(info, f, ensure_ascii=False, indent=4) |