|
|
|
import concurrent.futures |
|
import requests |
|
|
|
from progress.bar import ChargingBar |
|
|
|
from entity import Entity, read_entities |
|
from common import defaults,mkdir |
|
import web |
|
|
|
PARALLEL = 20 |
|
|
|
def do_screenshot(e: Entity): |
|
assert(e.url) |
|
sfn = requests.post('http://puppet:8000/screenshot', json={ |
|
'url': e.url, |
|
'id': e.id, |
|
'path': f'{defaults.SCREENSHOT_PATH}/{e.bco}.png', |
|
'logos': f'{defaults.LOGOS_DATA_PATH}/{e.bco}.png' |
|
}) |
|
|
|
def get_entity_logo(e: Entity): |
|
fn = f"{defaults.LOGOS_DATA_PATH}/{e.bco}.0.png" |
|
web.get_img_logo(e.logo, fn) |
|
|
|
def from_csv(args): |
|
ACTIONS = [] |
|
if (args.certs): |
|
ACTIONS.append(web.get_cert) |
|
mkdir.make_dirs([defaults.CERTS_PATH]) |
|
if (args.logos): |
|
ACTIONS.append(web.get_logos) |
|
mkdir.make_dirs([defaults.LOGOS_DATA_PATH]) |
|
if (args.screenshots): |
|
ACTIONS.append(do_screenshot) |
|
mkdir.make_dirs([defaults.SCREENSHOT_PATH]) |
|
if (args.entity_logo): |
|
ACTIONS.append(get_entity_logo) |
|
mkdir.make_dirs([defaults.LOGOS_DATA_PATH]) |
|
|
|
print(ACTIONS) |
|
with concurrent.futures.ThreadPoolExecutor(max_workers = args.parallel) as executor: |
|
futures = {} |
|
entities = read_entities(args.csv) |
|
qs = len(entities.keys())*len(ACTIONS) |
|
bar = ChargingBar(f'vendor ({qs} jobs)', max=qs) |
|
|
|
for e in entities.values(): |
|
futures.update({executor.submit(f, e): (e, f) for f in ACTIONS}) |
|
print('waiting for futures') |
|
|
|
for f in concurrent.futures.as_completed(futures): |
|
(e, a) = futures[f] |
|
try: |
|
f.result() |
|
except Exception as err: |
|
print(f'{a}({e.url}) generated an exception: {err}') |
|
bar.next() |
|
bar.finish() |
|
|
|
|
|
|
|
|
|
if __name__ == '__main__': |
|
import argparse |
|
print("🌏 getting vendor data") |
|
parser = argparse.ArgumentParser(description='extract certificates and screenshots websites') |
|
parser.add_argument('--csv', metavar='csv', type=str, |
|
default=defaults.MAIN_CSV_PATH, |
|
help='main database') |
|
parser.add_argument('--parallel', metavar='parallel', type=int, |
|
default=PARALLEL, |
|
help='number of concurrent jobs') |
|
parser.add_argument('--logos', metavar='logos', type=bool, |
|
action=argparse.BooleanOptionalAction, |
|
default=True, help='try to get logos') |
|
parser.add_argument('--entity-logo', metavar='entity_logo', type=bool, |
|
action=argparse.BooleanOptionalAction, |
|
default=True, help='try to get logos form ENTITY') |
|
parser.add_argument('--certs', metavar='certs', type=bool, |
|
action=argparse.BooleanOptionalAction, |
|
default=True, help='try to get certs') |
|
parser.add_argument('--screenshots', metavar='screenshots', type=bool, |
|
action=argparse.BooleanOptionalAction, |
|
default=True, help='try to get screenshots') |
|
|
|
args = parser.parse_args() |
|
from_csv(args) |
|
|