Niv Sardi committed
Commit f1ab0d5
1 Parent(s): dc89ab8

implement python inotify watcher and add to docker-compose

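The new cutter service watches the shared ./data volume: the puppet/screenshot side drops {bco}.full.png screenshots and matching {bco}.full.txt logo coordinates there, and watcher.py reacts to each finished PNG by tiling it with imtool.crop. A minimal sketch of the inotify pattern the watcher relies on (the directory path is illustrative):

    import inotify.adapters

    watcher = inotify.adapters.Inotify()
    watcher.add_watch('./data')  # the directory docker-compose mounts into every service

    for _, type_names, path, filename in watcher.event_gen(yield_nones=False):
        # IN_CLOSE_WRITE fires once the writer closes the file, so the PNG is complete
        if 'IN_CLOSE_WRITE' in type_names and filename.endswith('.png'):
            print(f"ready for tiling: {path}/{filename}")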
Dockerfile.python ADDED
@@ -0,0 +1,8 @@
+ FROM docker.io/jjanzic/docker-python3-opencv
+ MAINTAINER Niv Sardi <x@filtra.me>
+ WORKDIR /app
+
+ COPY crawler ./src
+ RUN pip install -r ./src/requirements.txt
+
+ CMD python3 ./src/watcher.py
crawler/entity.py CHANGED
@@ -1,6 +1,7 @@
  #!/usr/bin/env python3

  class Entity():
+     _DATA_PATH = './data'
      def __init__(self, name, bco, url=None, logo=None):
          self.name = name
          self.bco = bco
@@ -35,7 +36,7 @@ Entity:

      @property
      def DATA_PATH(self):
-         return f"./data/{self.bco}"
+         return self._DATA_PATH

      def to_row(self):
          return [self.name, self.bco, self.url, self.logo]
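With DATA_PATH now a single shared directory rather than ./data/{bco}, callers are expected to encode the entity in the filename instead. A hedged illustration of the naming convention the rest of this commit follows (the constructor arguments are made up):

    from entity import Entity

    e = Entity('debug bank', 'debug', url='http://www.bbva.com.ar')
    print(e.DATA_PATH)                        # ./data, shared by every entity
    print(f"{e.DATA_PATH}/{e.bco}.full.png")  # ./data/debug.full.png, as written by screenshot.py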
crawler/imtool.py ADDED
@@ -0,0 +1,138 @@
+ #!/usr/bin/env python3
+
+ import os
+ import math
+ import cv2
+ import pathlib
+ from typing import NamedTuple
+
+ from entity import Entity
+
+ TILE_SIZE = 800
+ TILE_OVERLAP = 0.8
+
+ class BoundingBox(NamedTuple):
+     x: float = 0.0
+     y: float = 0.0
+     w: float = 0.0
+     h: float = 0.0
+
+     @classmethod
+     def from_centroid(cls, c):
+         x = math.floor(c.x + c.w/2)
+         y = math.floor(c.y + c.h/2)
+         self = cls(x=x, y=y, w=math.ceil(c.w), h=math.ceil(c.h))
+         return self
+
+     @classmethod
+     def from_dict(cls, d):
+         self = cls(x=d['x'], y=d['y'], w=d['width'], h=d['height'])
+         return self
+
+ class Centroid(BoundingBox):
+     @classmethod
+     def from_bounding_box(cls, b):
+         x = math.floor(b.x - b.w/2)
+         y = math.floor(b.y - b.h/2)
+         return cls(x=x, y=y, w=math.ceil(b.w), h=math.ceil(b.h))
+
+ def read_bounding_boxes(filename):
+     boxes = []
+     with open(filename, 'r') as f:
+         (x,y,w,h) = [float(i) for i in f.readline().split(' ')[1:]]
+         boxes.append(BoundingBox(x,y,w,h))
+     return boxes
+
+ def floor_point(a, b):
+     return (math.floor(a), math.floor(b))
+
+ def cut_img(im, s, e):
+     x = s[0]
+     y = s[1]
+     w = e[0] - x
+     h = e[1] - y
+
+     print("DEBUG", im.shape, x, y, w, h)
+     return im[y:h, x:w]
+
+ def cut_logo(im, l):
+     (x, y, w, h) = floor_logo(l)
+     return im[x:w, y:h]
+
+ def crop(fn, logos):
+     basename = os.path.basename(fn).replace('.png', '')
+     out = f"./data/squares"
+     pathlib.Path(out).mkdir(parents=True, exist_ok=True)
+
+     im = cv2.imread(fn)
+
+     (h, w, c) = im.shape
+     (tx, ty) = (
+         math.ceil(w/(TILE_SIZE*TILE_OVERLAP)),
+         math.ceil(h/(TILE_SIZE*TILE_OVERLAP))
+     )
+
+     print('shape', basename, tx, ty, h, w, logos)
+     for x in range(tx):
+         for y in range(ty):
+             color = (0, x*(255/tx), y*(255/ty))
+
+             fx = math.floor(x*(w - TILE_SIZE)/(tx))
+             fy = math.floor(y*(h - TILE_SIZE)/(ty))
+
+             start = (fx, fy)
+             end = (fx + TILE_SIZE, fy + TILE_SIZE)
+
+             #im = cv2.rectangle(im, start, end, color, 10)
+             li = []
+             for l in logos:
+                 def intersect():
+                     six = l.x - fx
+                     siy = l.y - fy
+                     eix = six + l.w
+                     eiy = siy + l.h
+
+                     if six < 0:
+                         if six + l.w < 0:
+                             return None
+                         six = 0
+                     if siy < 0:
+                         if siy + l.h < 0:
+                             return None
+                         siy = 0
+                     if eix > TILE_SIZE:
+                         if eix - l.w > TILE_SIZE:
+                             return None
+                         eix = TILE_SIZE
+                     if eiy > TILE_SIZE:
+                         if eiy - l.h > TILE_SIZE:
+                             return None
+                         eiy = TILE_SIZE
+
+                     return (six, siy), (eix, eiy)
+
+                 p = intersect()
+                 if p:
+                     li.append(p)
+
+             c = (255, 0, 0)
+             nim = im[fy:fy+TILE_SIZE, fx:fx+TILE_SIZE]
+             name = f"{out}/{basename}.{x}.{y}"
+             cv2.imwrite(f"{name}.png", nim)
+             if len(li):
+                 with open(f"{name}.txt", 'w') as f:
+                     for p in li:
+                         cw = p[1][0] - p[0][0]
+                         ch = p[1][1] - p[0][1]
+                         cx = cw/2 + p[0][0]
+                         cy = ch/2 + p[0][1]
+
+                         a = f"{basename} {cx/TILE_SIZE} {cy/TILE_SIZE} {cw/TILE_SIZE} {ch/TILE_SIZE}"
+                         f.write(a)
+                         print(a)
+
+ if __name__ == '__main__':
+     boxes = read_bounding_boxes("./data/debug.full.txt")
+     print(boxes)
+     crop("./data/debug.full.png", boxes)
+
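crop() slides a TILE_SIZE window over the screenshot, spreading the tile origins so neighbours overlap (the TILE_OVERLAP factor shrinks the effective stride). A worked example of the tile arithmetic, assuming the 1920x8000 window size configured in screenshot.py:

    import math

    TILE_SIZE, TILE_OVERLAP = 800, 0.8
    w, h = 1920, 8000  # --window-size used by screenshot.py

    tx = math.ceil(w / (TILE_SIZE * TILE_OVERLAP))  # 3 columns
    ty = math.ceil(h / (TILE_SIZE * TILE_OVERLAP))  # 13 rows

    # same origin formula as crop(); each tile is then im[fy:fy+TILE_SIZE, fx:fx+TILE_SIZE]
    origins = [(math.floor(x * (w - TILE_SIZE) / tx), math.floor(y * (h - TILE_SIZE) / ty))
               for x in range(tx) for y in range(ty)]
    print(tx, ty, origins[:3])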
crawler/main.py CHANGED
@@ -1,5 +1,5 @@
  import csv
-
+ import pathlib
  import requests
  from bs4 import BeautifulSoup
  from progress.bar import ChargingBar
@@ -7,13 +7,14 @@ from progress.bar import ChargingBar
  from entity import Entity
  from common import selectors

+ pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+
  URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
  page = requests.get(URL)
-
  soup = BeautifulSoup(page.content, "html.parser")

  options = soup.find(class_="form-control").find_all('option')
- with open('entidades.csv', 'w', newline='') as csvfile:
+ with open('./data/entidades.csv', 'w', newline='') as csvfile:
      writer = csv.writer(csvfile)
      writer.writerow(Entity.row_names())

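main.py now writes its scrape to ./data/entidades.csv, with a header row from Entity.row_names() and one to_row() per bank, which vendor.py's from_csv later consumes. A small hedged sketch of reading it back, assuming the [name, bco, url, logo] column order of to_row():

    import csv

    with open('./data/entidades.csv', newline='') as f:
        reader = csv.reader(f)
        header = next(reader)  # Entity.row_names()
        for name, bco, url, logo in reader:
            print(bco, url)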
crawler/requirements.txt CHANGED
@@ -1,3 +1,3 @@
  bs4==0.0.1
  progress==1.6
- selenium==4.3.0
+ inotify
crawler/screenshot.py CHANGED
@@ -1,5 +1,8 @@
  #!/usr/bin/env python3
  #
+
+ import math
+
  from selenium import webdriver
  from selenium.webdriver.common.keys import Keys
  from selenium.webdriver.common.by import By
@@ -11,18 +14,22 @@ options = webdriver.FirefoxOptions()
  options.add_argument("--headless")
  options.add_argument("--window-size=1920x8000")

+ def coord_to_point(c):
+     x = math.floor(c['x'] + c['width']/2)
+     y = math.floor(c['y'] + c['height']/2)
+     return f"{x} {y} {math.ceil(c['width'])} {math.ceil(c['height'])}"
+
  driver = webdriver.Firefox(options=options)
  def sc_entity(e: Entity):
      print(e)
      driver.get(e.url)
-     driver.save_screenshot(f"{e.DATA_PATH}/screenshot.png")
-     driver.save_full_page_screenshot(f"{e.DATA_PATH}/screenshot.full.png")
+     driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
+     driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")

      logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
-     with open(f"{e.DATA_PATH}/logo.pos", 'w') as f:
+     with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
          for i in logos:
-             f.write(repr(i.rect))
-             print(i.get_attribute('src'), i.rect)
+             f.write(f"{e.bco} {coord_to_point(i.rect)}")

  if __name__ == '__main__':
      sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
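coord_to_point flattens a Selenium rect dict into the space-separated record that imtool.read_bounding_boxes parses back (it drops the leading name and keeps the four floats). A quick hedged example with made-up rect values:

    import math

    rect = {'x': 10, 'y': 20, 'width': 100, 'height': 50}  # what element.rect returns
    cx = math.floor(rect['x'] + rect['width'] / 2)   # 60
    cy = math.floor(rect['y'] + rect['height'] / 2)  # 45
    print(f"debug {cx} {cy} {math.ceil(rect['width'])} {math.ceil(rect['height'])}")
    # -> "debug 60 45 100 50", one line of {bco}.full.txt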
crawler/vendor.py CHANGED
@@ -1,58 +1,21 @@
  #!/usr/bin/env python3
  import pathlib
- import ssl
+
  import shutil
  import csv
  import concurrent.futures
  import requests
- from bs4 import BeautifulSoup
+
  from progress.bar import ChargingBar

  from entity import Entity
- from common import selectors
  import screenshot
-
- def write_cert(e: Entity):
-     ssl_url = e.url.split("/")[2]
-     try:
-         cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
-         with open(f"{e.DATA_PATH}/cert", 'w') as f:
-             f.write(cert)
-     except Exception as err:
-         with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
-             f.write(str(err))
-
- def get_logos(e: Entity, page):
-     soup = BeautifulSoup(page.content, "html.parser")
-     logos = soup.select(selectors.logo)
-
-     i = 0
-     lfn = []
-     for l in logos:
-         src = l.attrs['src']
-         ext = src.split('.')[-1].split('/')[-1]
-         try:
-             res = requests.get(src, stream=True)
-         except Exception:
-             res = requests.get(f"{e.url}/{src}")
-
-         fn = f"{e.DATA_PATH}/{i}.{ext}"
-         with open(fn, "wb") as f:
-             shutil.copyfileobj(res.raw, f)
-         lfn.append(fn)
-         i+=1
+ import web

  def query_vendor_site(e: Entity):
-     pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
-
-     try:
-         page = requests.get(e.url)
-     except Exception:
-         e.url = e.url.replace('http', 'https')
-         page = requests.get(e.url)
-
-     write_cert(e)
-     get_logos(e, page)
+     page = web.get_page(e)
+     fn = web.get_cert(e)
+     lfn = web.get_logos(e, page)
      screenshot.sc_entity(e)
      return (fn, lfn)

@@ -73,8 +36,10 @@ def from_csv(fn)
          bar.next()
      bar.finish()

- #query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
+ #query_vendor_site(Entity.from_dict({'url':'http://www.bancoprovincia.com.ar', 'bco':'debug'}))
  #exit()

  if __name__ == '__main__':
-     from_csv('entidades.csv')
+     #pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)
+     pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+     from_csv(f"{Entity._DATA_PATH}/entidades.csv")
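query_vendor_site is now a thin orchestrator over web.get_page / web.get_cert / web.get_logos plus the screenshot. A hedged single-entity run mirroring the commented-out debug call above (importing vendor pulls in screenshot, which starts a headless Firefox at import time):

    import pathlib
    from entity import Entity
    from vendor import query_vendor_site

    pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
    e = Entity.from_dict({'url': 'http://www.bancoprovincia.com.ar', 'bco': 'debug'})
    cert_file, logo_files = query_vendor_site(e)
    print(cert_file)   # ./data/debug.cert
    print(logo_files)  # ./data/logos/debug.<i>.<ext> entries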
crawler/watcher.py ADDED
@@ -0,0 +1,20 @@
+ import os
+ import inotify.adapters
+ from imtool import read_bounding_boxes, crop
+
+ def watch(dir):
+     i = inotify.adapters.Inotify()
+     i.add_watch(dir)
+     for event in i.event_gen(yield_nones=False):
+         (_, type_names, path, filename) = event
+
+         if filename.endswith(".png") and type_names[0] in ['IN_CLOSE_WRITE']:
+             print(f"--PATH=[{path}] FILENAME=[{filename}] EVENT_TYPES={type_names}")
+             try:
+                 bbs = read_bounding_boxes(os.path.join(path, filename.replace('.png', '.txt')))
+                 crop(os.path.join(path, filename), bbs)
+             except Exception as e:
+                 print(f"error: {e}")
+
+ if __name__ == '__main__':
+     watch('./data')
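The handler resolves the matching .txt when a .png is closed, so a manual smoke test should write the label file first and the image last. A hedged local test (cv2 and numpy ship with the Dockerfile.python base image; the box values are made up):

    import numpy as np
    import cv2

    # label line in the "name cx cy w h" format that screenshot.py emits
    with open('./data/debug.full.txt', 'w') as f:
        f.write('debug 960 400 120 60')

    # closing the PNG raises IN_CLOSE_WRITE, so a running watcher.py calls crop() on it
    cv2.imwrite('./data/debug.full.png', np.zeros((8000, 1920, 3), dtype=np.uint8))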
crawler/web.py ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/env python3
+ import ssl
+ import shutil
+ import requests
+ from bs4 import BeautifulSoup
+
+ from entity import Entity
+ from common import selectors
+ def get_page(e: Entity):
+     try:
+         page = requests.get(e.url)
+     except Exception:
+         e.url = e.url.replace('http', 'https')
+         page = requests.get(e.url)
+     return page
+
+ def get_cert(e: Entity):
+     ssl_url = e.url.split("/")[2]
+     fn = f"{e.DATA_PATH}/{e.bco}.cert"
+     try:
+         cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
+         with open(fn, 'w') as f:
+             f.write(cert)
+     except Exception as err:
+         with open(f"{fn}.error.log", 'w+') as f:
+             f.write(str(err))
+     return fn
+
+ def get_logos(e: Entity, page):
+     soup = BeautifulSoup(page.content, "html.parser")
+     logos = soup.select(selectors.logo)
+
+     i = 0
+     lfn = []
+     for l in logos:
+         src = l.attrs['src']
+         ext = src.split('.')[-1].split('/')[-1]
+         try:
+             res = requests.get(src, stream=True)
+         except Exception:
+             res = requests.get(f"{e.url}/{src}")
+
+         fn = f"{e.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
+         with open(fn, "wb") as f:
+             shutil.copyfileobj(res.raw, f)
+         lfn.append(fn)
+         i+=1
+     return lfn
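get_cert derives the TLS host by splitting the URL on '/', so it assumes e.url always carries a scheme. A small illustration of that parsing and of the return values under the shared ./data layout (values assume an entity with bco='debug'):

    url = 'http://www.bbva.com.ar'
    host = url.split('/')[2]  # 'www.bbva.com.ar', handed to ssl.get_server_certificate
    print(host)

    # typical results:
    #   get_cert(e)        -> './data/debug.cert'
    #   get_logos(e, page) -> ['./data/logos/debug.0.png', ...]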
docker-compose.yaml CHANGED
@@ -12,9 +12,10 @@ services:
        DEBUG: "puppet"
      depends_on:
        - "browserless"
-     command: "sh -c 'while echo deno; do sleep 3h; done'" #"deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
+     # command: "sh -c 'while echo deno; do sleep 3h; done'" #
+     command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
      volumes:
-       - "./src-deno:/app/src:z"
+       # - "./src:/app/src:z" # for debugging
        - "./data:/app/data:z"
      #restart: unless-stopped:600
      deploy:
@@ -22,7 +23,15 @@
          condition: any
          delay: 600s
          window: 300s
-
+   cutter:
+     build:
+       dockerfile: Dockerfile.python
+       context: .
+     depends_on:
+       - "puppet"
+     volumes:
+       # - "./crawler:/app/src:z" # for debugging
+       - "./data:/app/data:z"
    browserless:
      image: docker.io/zenika/alpine-chrome
      entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]