Spaces:
Runtime error
Runtime error
#!/usr/bin/env python | |
import csv | |
import requests | |
import shutil | |
import re | |
from bs4 import BeautifulSoup | |
from progress.bar import ChargingBar | |
import concurrent.futures | |
import web | |
from entity import Entity | |
from common import selectors, defaults, mkdir | |
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp' | |
page = requests.get(URL) | |
soup = BeautifulSoup(page.content, 'html.parser') | |
options = soup.find(class_='form-control').find_all('option') | |
mkdir.make_dirs([defaults.DATA_PATH, defaults.LOGOS_DATA_PATH]) | |
def get_links(soup): | |
for l in soup.select('.post-pagina-interior'): | |
for a in l.select('a'): | |
if 'href' in a.attrs and a.attrs['href'].startswith('http'): | |
return a.attrs['href'] | |
with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile: | |
writer = csv.writer(csvfile) | |
writer.writerow(Entity.row_names()) | |
bar = ChargingBar('get entities', max=len(options)) | |
def get_bco(o, i): | |
(name, bco)= (o.text, o.attrs['value']) | |
page = requests.post(URL, data={'bco': bco}, stream=False) | |
soup = BeautifulSoup(page.content, 'html.parser') | |
img = f'https://www.bcra.gob.ar/Imagenes/logosbancos/{bco}.jpg' | |
e = Entity(name, id=i, bco=bco, logo=str(img), url=str(get_links(soup))) | |
writer.writerow(e.to_row()) | |
i+=1 | |
with concurrent.futures.ThreadPoolExecutor(max_workers = 20) as executor: | |
futures = {executor.submit(get_bco, o, i): o for (i, o) in enumerate(options[1:])} | |
for f in concurrent.futures.as_completed(futures): | |
o = futures[f] | |
try: | |
f.result() | |
except Exception as err: | |
print(f'({o}) generated an exception: {err}') | |
bar.next() | |
bar.finish() | |
shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH) | |
print(f'scrape finished, found {len(options[1:])} entities, dumped to {defaults.MAIN_CSV_PATH}') | |