Niv Sardi committed
Commit 60ec487
1 Parent(s): d6dde3c

make Entity a NamedTuple


Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>
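
A practical note on the change: a NamedTuple is immutable, so the old pattern of constructing an Entity and then assigning e.logo or e.url afterwards (as main.py used to do) no longer applies; fields are supplied at construction time, or a modified copy is made with _replace. A minimal usage sketch, with placeholder values that are not part of this commit:

from entity import Entity

e = Entity(name='Banco Ejemplo', id=1, bco='00007')   # hypothetical values

# e.logo = 'logo.png'  # would raise AttributeError: NamedTuple fields are read-only

# Derive a new record instead of mutating in place.
e = e._replace(url='http://example.com', logo='logo.png')

print(e.to_row())  # [1, 'Banco Ejemplo', '00007', 'http://example.com', 'logo.png']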

python/common/defaults.py ADDED
@@ -0,0 +1 @@
+DATA_PATH='./data'
python/entity.py CHANGED
@@ -1,16 +1,23 @@
 #!/usr/bin/env python3
-
-class Entity():
-    _DATA_PATH = './data'
-    def __init__(self, name, bco, url=None, logo=None):
-        self.name = name
-        self.bco = bco
-        self.url = url
-        self.logo = logo
+import csv
+from typing import NamedTuple
+
+def read_entities(fn):
+    with open('./data/entidades.csv', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
+        return bcos
+
+class Entity(NamedTuple):
+    name: str
+    id: int = 0
+    bco: str = "debug"
+    url: str = None
+    logo: str = None
 
     def __repr__(self):
         return f"""
-Entity:
+Entity {self.id}:
   name: {self.name}
   bco: {self.bco}
   url: {self.url}
@@ -22,26 +29,22 @@ Entity:
         self = apply(cls, l)
         return self
 
+    # this now looks horrible…
     @classmethod
     def from_dict(cls, d):
-        self = cls(None, None)
-
-        for f in d.keys():
-            setattr(self, f, d[f])
+        o = {'name': None, 'id': 0, 'bco': None, 'url': None, 'logo': None}
+        o.update(d)
+        self = cls(o['name'], o['id'], o['bco'], o['url'], o['logo'])
         return self
 
     @classmethod
     def row_names(cls):
-        return ['name', 'bco', 'url', 'logo']
-
-    @property
-    def DATA_PATH(self):
-        return self._DATA_PATH
+        return ['id', 'name', 'bco', 'url', 'logo']
 
     def to_row(self):
-        return [self.name, self.bco, self.url, self.logo]
+        return [self.id, self.name, self.bco, self.url, self.logo]
 
 if __name__ == '__main__':
-    e = Entity.from_dict({'url': 'blah'})
+    e = Entity.from_dict({'name': 'test', 'url': 'blah'})
     assert(e.url == 'blah')
     print(e)
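
One caveat on the new read_entities helper: as committed it calls an undefined update() and ignores its fn argument, re-opening a hard-coded CSV path. A hedged sketch of what it appears to intend, merging each row with its enumeration index through a dict literal:

import csv

def read_entities(fn):
    # Index rows by 'bco', attaching a sequential 'id' to each record.
    with open(fn, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return {d['bco']: {**d, 'id': i} for i, d in enumerate(reader)}

Called as read_entities(f'{defaults.DATA_PATH}/entidades.csv'), this would line up with the paths the rest of the commit builds from common.defaults.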
python/main.py CHANGED
@@ -8,34 +8,32 @@ from progress.bar import ChargingBar
 
 from entity import Entity
 from common import selectors
+from common import defaults
 
-pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+pathlib.Path(f'{defaults.DATA_PATH}/logos').mkdir(parents=True, exist_ok=True)
 
 DATA_FILE = './data/entidades.csv'
-URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
+URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
 page = requests.get(URL)
-soup = BeautifulSoup(page.content, "html.parser")
+soup = BeautifulSoup(page.content, 'html.parser')
 
-options = soup.find(class_="form-control").find_all('option')
-with open(f"{DATA_FILE}.tmp", 'w', newline='') as csvfile:
+options = soup.find(class_='form-control').find_all('option')
+with open(f'{DATA_FILE}.tmp', 'w', newline='') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(Entity.row_names())
 
+    i = 0
     bar = ChargingBar('Processing', max=len(options))
     for o in options[1:]:
-        e = Entity(
-            name = o.text,
-            bco = o.attrs['value']
-        )
-        page = requests.post(URL, data={'bco': e.bco})
-        soup = BeautifulSoup(page.content, "html.parser")
+        (name, bco)= (o.text, o.attrs['value'])
+        page = requests.post(URL, data={'bco': bco})
+        soup = BeautifulSoup(page.content, 'html.parser')
         try:
             img = soup.select_one(selectors.logosbancos).attrs['src']
-            img = img.replace("../", "https://www.bcra.gob.ar/")
+            img = img.replace('../', 'https://www.bcra.gob.ar/')
        except AttributeError as err:
-            print('img', e.name, err)
+            print('img', name, err)
            img = None
-        e.logo = img
 
         a = soup.select_one(selectors.entity_http)
         try:
@@ -48,10 +46,11 @@ with open(f"{DATA_FILE}.tmp", 'w', newline='') as csvfile:
         except TypeError:
             print('ERROR', a)
 
-        e.url = a
+        e = Entity(name, id=i, bco=bco, logo=img, url=a)
         writer.writerow(e.to_row())
+        i+=1
         bar.next()
     bar.finish()
 
-shutil.move(f"{DATA_FILE}.tmp", DATA_FILE)
-print("scrape finished")
+shutil.move(f'{DATA_FILE}.tmp', DATA_FILE)
+print('scrape finished')
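
The loop above keeps a hand-maintained counter (i = 0 … i+=1) only to feed Entity(id=i, …); enumerate() expresses the same thing directly. A minimal, self-contained sketch of that shape, with dummy rows standing in for the scraped <option> elements and the scraped logo/url omitted:

from entity import Entity

rows = [('Banco Uno', '00011'), ('Banco Dos', '00022')]   # placeholder data

for i, (name, bco) in enumerate(rows):
    e = Entity(name, id=i, bco=bco)   # logo/url would come from the scrape
    print(e.to_row())                 # e.g. [0, 'Banco Uno', '00011', None, None]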
python/screenshot.py CHANGED
@@ -9,6 +9,7 @@ from selenium.webdriver.common.by import By
 
 from common import selectors
 from entity import Entity
+from common import defaults
 
 options = webdriver.FirefoxOptions()
 options.add_argument("--headless")
@@ -24,13 +25,13 @@ def sc_entity(e: Entity):
     print(e)
     driver.implicitly_wait(10)
     driver.get(e.url)
-    driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
-    driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
+    driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
+    driver.save_full_page_screenshot(f"{defaults.DATA_PATH}/{e.bco}.full.png")
 
     logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
     logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
     logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
-    with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
+    with open(f"{defaults.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
         for i in logos:
             f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
 
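sc_entity now spells out its three output paths by hand from defaults.DATA_PATH. If that pattern spreads, a small pathlib helper keeps the naming in one place; this is only a sketch, and the helper name is not part of the commit:

import pathlib
from common import defaults

def out_path(e, suffix):
    # out_path(e, 'full.png') -> data/<bco>.full.png
    return pathlib.Path(defaults.DATA_PATH) / f"{e.bco}.{suffix}"

# driver.save_screenshot(str(out_path(e, 'png')))
# driver.save_full_page_screenshot(str(out_path(e, 'full.png')))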
python/vendor.py CHANGED
@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
 import pathlib
-
-import shutil
 import csv
 import concurrent.futures
 import requests
@@ -9,6 +7,7 @@ import requests
 from progress.bar import ChargingBar
 
 from entity import Entity
+from common import defaults
 import screenshot
 import web
 
@@ -31,6 +30,7 @@ def from_csv(fn):
             (cert, logos) = f.result()
         except Exception as exc:
             print('%r generated an exception: %s' % (url, exc))
+            raise
         else:
             print(cert, logos)
         bar.next()
@@ -40,6 +40,6 @@ def from_csv(fn):
     #exit()
 
 if __name__ == '__main__':
-    #pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)
-    pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
-    from_csv(f"{Entity._DATA_PATH}/entidades.csv")
+    #pathlib.Path(defaults.DATA_PATH).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(f"{defaults.DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+    from_csv(f"{defaults.DATA_PATH}/entidades.csv")
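
The added raise re-throws the first worker exception after logging it, so one failing URL now aborts the whole run, presumably a deliberate fail-fast choice while debugging. For reference, the concurrent.futures pattern from_csv appears to follow looks roughly like this; the names and worker body are illustrative, not the file's actual code:

import concurrent.futures

def process(url):
    # stand-in for the real per-entity screenshot/cert/logo work
    return (f'cert-for-{url}', [])

urls = ['http://example.com', 'http://example.org']
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(process, u): u for u in urls}
    for f in concurrent.futures.as_completed(futures):
        url = futures[f]
        try:
            (cert, logos) = f.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            raise  # fail fast: stop the scrape on the first error
        else:
            print(cert, logos)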
python/web.py CHANGED
@@ -1,22 +1,25 @@
 #!/usr/bin/env python3
 import ssl
+import shutil
+import requests
 from bs4 import BeautifulSoup
 
 from entity import Entity
-from common import selectors
+from common import selectors, defaults
+
 def get_page(e: Entity):
     try:
         page = requests.get(e.url)
     except Exception:
-        e.url = e.url.replace('http', 'https')
-        page = requests.get(e.url)
+        url = e.url.replace('http', 'https')
+        page = requests.get(url)
     return page
 
 def get_cert(e: Entity):
     ssl_url = e.url.split("/")[2]
     try:
         cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
-        fn = f"{e.DATA_PATH}/{e.bco}.cert"
+        fn = f"{defaults.DATA_PATH}/{e.bco}.cert"
         with open(fn, 'w') as f:
             f.write(cert)
     except Exception as err:
@@ -24,23 +27,27 @@ def get_cert(e: Entity):
         f.write(str(err))
     return fn
 
+def get_img_logo(src: string):
+    ext = src.split('.')[-1].split('/')[-1]
+    res = requests.get(src, stream=True)
+
+    fn = f"{defaults.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
+    with open(fn, "wb") as f:
+        shutil.copyfileobj(res.raw, f)
+    return fn
+
 def get_logos(e: Entity, page):
     soup = BeautifulSoup(page.content, "html.parser")
-    logos = soup.select(selectors.logo)
+    logos = soup.select(selectors.img_logo)
+    logos.extend(soup.select(selectors.id_logo))
+    logos.extend(soup.select(selectors.cls_logo))
 
     i = 0
     lfn = []
     for l in logos:
-        src = l.attrs['src']
-        ext = src.split('.')[-1].split('/')[-1]
-        try:
-            res = requests.get(src, stream=True)
-        except Exception:
-            res = requests.get(f"{e.url}/{src}")
-
-        fn = f"{e.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
-        with open(fn, "wb") as f:
-            shutil.copyfileobj(res.raw, f)
-        lfn.append(fn)
+        if 'src' in l.attrs:
+            src = l.attrs['src']
+            if not src.startswith('http'): src = e.url + src
+            lfn.append(get_img_logo(src))
         i+=1
     return lfn
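
Finally, get_img_logo will not run as committed: string is not a defined name (the annotation should be str), and it reads e and i from the caller's scope. A hedged sketch of a self-contained version that keeps the {DATA_PATH}/logos/{bco}.{i}.{ext} layout from the diff, with the call site passing those values explicitly:

import shutil
import requests

from entity import Entity
from common import defaults

def get_img_logo(e: Entity, src: str, i: int) -> str:
    # Download one logo image and store it under DATA_PATH/logos/.
    ext = src.split('.')[-1].split('/')[-1]
    res = requests.get(src, stream=True)

    fn = f"{defaults.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
    with open(fn, "wb") as f:
        shutil.copyfileobj(res.raw, f)
    return fn

# in get_logos, the corresponding call would become:
#     lfn.append(get_img_logo(e, src, i))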