Niv Sardi committed
Commit e919aa3 • 1 Parent(s): acbdf2a

move from complicated multi-container to simpler design with a shell script

Dockerfile.python CHANGED
@@ -2,8 +2,12 @@ FROM docker.io/python:3-slim-buster
 MAINTAINER Niv Sardi <x@filtra.me>
 WORKDIR /app
 
-COPY python ./src
-RUN pip install -r ./src/requirements.txt
-RUN apt update && apt install -y libglib2.0-0 libgl1 && rm -rf /var/cache/apt
+RUN apt update && apt install -y libcairo2 libglib2.0-0 libgl1 && rm -rf /var/cache/apt
+COPY python/requirements.txt ./python/requirements.txt
+RUN pip install -r ./python/requirements.txt
 
-CMD python3 ./src/watcher.py
+COPY run.sh ./run
+RUN chmod +x run
+COPY python ./python
+
+CMD env PATH=$PATH:/usr/local/bin python3 ./python/watcher.py
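The reordering follows the usual Docker layer-caching pattern: python/requirements.txt is copied and pip-installed before the rest of the source tree, so edits under python/ no longer invalidate the apt and pip layers; those only rebuild when requirements.txt or the apt line itself changes.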
README.org CHANGED
@@ -18,7 +18,8 @@ The process is pretty simple:
 * running
 #+begin_src sh
 # build the training dataset
-docker-compose up --build --remove-orphans
+docker-compose up --build --remove-orphans -d
+docker-compose exec python ./run
 
 # run the training on your machine or collab
 # https://colab.research.google.com/drive/10R7uwVJJ1R1k6oTjbkkhxPDka7COK-WE
docker-compose.yaml CHANGED
@@ -1,50 +1,75 @@
 version: "3.9" # optional since v1.27.0
 services:
-  crawler:
+  python:
     build:
       dockerfile: Dockerfile.python
       context: .
-    command: "python3 src/main.py"
-    volumes:
-      - "./data:/app/data:z"
-
-  puppet:
-    build:
-      dockerfile: Dockerfile.deno
-      context: .
-    links:
-      - browserless
     environment:
-      BROWSERLESS_HOST: browserless
-      BROWSERLESS_PORT: 3000
-      DEBUG: "puppet"
-    depends_on:
-      - "browserless"
-    # command: "sh -c 'while echo deno; do sleep 3h; done'" # debug
-    command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
-    volumes:
-      - "./deno:/app/src:z" # for debugging
-      - "./data:/app/data:z"
-    #restart: unless-stopped:600
-    deploy:
-      restart_policy:
-        condition: any
-        delay: 600s
-        window: 300s
-
-  cutter:
-    build:
-      dockerfile: Dockerfile.python
-      context: .
+      GECKO_HOST: geckodriver
+      GECKO_PORT: 4444
     depends_on:
-      - "puppet"
+      - "geckodriver"
+    links:
+      - "geckodriver"
     volumes:
-      - "./python:/app/src:z" # for debugging
+      - "./python:/app/python:z" # for debugging
       - "./data:/app/data:z"
 
-  browserless:
-    image: docker.io/zenika/alpine-chrome
-    entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]
-    ports:
-      - "3000:3000"
+  geckodriver:
+    image: docker.io/instrumentisto/geckodriver
+    entrypoint: ["sh", "-c", "while true; do geckodriver --binary=/opt/firefox/firefox --log warn --port 4444 --host 0.0.0.0; sleep 2; done"]
+    ports: # this is not required but nice for local debug
+      - "4444:4444"
+  # crawler:
+  #   build:
+  #     dockerfile: Dockerfile.python
+  #     context: .
+  #   command: "sh -c 'while true; do python3 src/get_entities.py; touch data/entities.csv; sleep 24h; done'"
+  #   volumes:
+  #     - "./data:/app/data:z"
+
+  # cutter:
+  #   build:
+  #     dockerfile: Dockerfile.python
+  #     context: .
+  #   environment:
+  #     GECKO_HOST: geckodriver
+  #     GECKO_PORT: 4444
+  #   depends_on:
+  #     - "geckodriver"
+  #   links:
+  #     - "geckodriver"
+  #   volumes:
+  #     - "./python:/app/src:z" # for debugging
+  #     - "./data:/app/data:z"
+
+  # browserless:
+  #   image: docker.io/zenika/alpine-chrome
+  #   entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]
+  #   ports:
+  #     - "3000:3000"
+
+  # puppet:
+  #   build:
+  #     dockerfile: Dockerfile.deno
+  #     context: .
+  #   links:
+  #     - browserless
+  #   environment:
+  #     BROWSERLESS_HOST: browserless
+  #     BROWSERLESS_PORT: 3000
+  #     DEBUG: "puppet"
+  #   depends_on:
+  #     - "browserless"
+  #   #command: "sh -c 'while echo deno; do sleep 3h; done'" # debug
+  #   command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
+  #   volumes:
+  #     - "./deno:/app/src:z" # for debugging
+  #     - "./data:/app/data:z"
+  #   #restart: unless-stopped:600
+  #   deploy:
+  #     restart_policy:
+  #       condition: any
+  #       delay: 600s
+  #       window: 300s
 
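The compose file now wires the python service to a standalone geckodriver container through the GECKO_HOST/GECKO_PORT variables, replacing the old browserless Chrome setup. As a minimal sketch of what that wiring implies, assuming a Selenium client is among the Python dependencies (the actual watcher.py code is not part of this diff), a script inside the container could reach the remote Firefox like this:

import os

from selenium import webdriver

# GECKO_HOST/GECKO_PORT are injected by docker-compose.yaml; the fallbacks
# match the local-debug port mapping ("4444:4444").
host = os.environ.get('GECKO_HOST', 'localhost')
port = os.environ.get('GECKO_PORT', '4444')

# geckodriver speaks the WebDriver protocol, so a plain Remote driver works.
driver = webdriver.Remote(
    command_executor=f'http://{host}:{port}',
    options=webdriver.FirefoxOptions(),
)
try:
    driver.get('https://www.bcra.gob.ar/')
    print(driver.title)
finally:
    driver.quit()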
python/{main.py → get_entities.py} RENAMED
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import csv
 import requests
 import shutil
@@ -14,43 +15,50 @@ page = requests.get(URL)
 soup = BeautifulSoup(page.content, 'html.parser')
 
 options = soup.find(class_='form-control').find_all('option')
-mkdir.make_dirs([defaults.DATA_PATH])
+mkdir.make_dirs([defaults.DATA_PATH, defaults.LOGOS_DATA_PATH])
 
+i = 0
 with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(Entity.row_names())
 
-    i = 0
     bar = ChargingBar('Processing', max=len(options))
     for o in options[1:]:
-        (name, bco)= (o.text, o.attrs['value'])
-        page = requests.post(URL, data={'bco': bco})
-        soup = BeautifulSoup(page.content, 'html.parser')
-        try:
-            img = soup.select_one(selectors.logosbancos).attrs['src']
-            img = img.replace('../', 'https://www.bcra.gob.ar/')
-            fn = f"{defaults.LOGOS_DATA_PATH}/{bco}.0.png"
-            web.get_img_logo(img, fn)
-        except AttributeError as err:
-            print('img', name, err)
-            img = None
-
-        a = soup.select_one(selectors.entity_http)
-        try:
-            a = a.attrs['href']
-        except AttributeError:
-            a = soup.select_one(selectors.entity_mailto)
-            try:
-                a = 'http://' + a.attrs['href'].split('@')[1]
-
-            except TypeError:
-                print('ERROR', a)
-
-        e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
-        writer.writerow(e.to_row())
+        def get_bco():
+            (name, bco)= (o.text, o.attrs['value'])
+            page = requests.post(URL, data={'bco': bco})
+            soup = BeautifulSoup(page.content, 'html.parser')
+            try:
+                img = soup.select_one(selectors.logosbancos).attrs['src']
+                img = img.replace('../', 'https://www.bcra.gob.ar/')
+                fn = f"{defaults.LOGOS_DATA_PATH}/{bco}.0.png"
+                web.get_img_logo(img, fn)
+            except AttributeError as err:
+                print('img', name, err)
+                img = None
+
+            a = soup.select_one(selectors.entity_http)
+            try:
+                a = a.attrs['href']
+            except AttributeError:
+                a = soup.select_one(selectors.entity_mailto)
+                try:
+                    a = 'http://' + a.attrs['href'].split('@')[1]
+
+                except TypeError:
+                    print('ERROR', a)
+
+            e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
+            writer.writerow(e.to_row())
+
+        try:
+            get_bco()
+        except Exception as e:
+            print(f'Error processing: {e}')
 
         i+=1
         bar.next()
     bar.finish()
 
 shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
-print('scrape finished')
+print(f'scrape finished, found {i} entities, dumped to {defaults.MAIN_CSV_PATH}')
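The net effect of the new get_bco() wrapper is per-entity fault isolation: the loop body becomes a closure over o, i and writer, and the broad except Exception turns any scraping failure into a logged skip instead of aborting the whole run. Note that i still increments for skipped entities, so the count in the final "scrape finished" message includes entities whose rows were never written.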
run.sh ADDED
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+PY=python3
+echo "🐛 fetching entities"
+${PY} ./python/get_entities.py
+echo "🌏 getting vendor data"
+${PY} ./python/vendor.py --parallel $(cat /proc/cpuinfo | grep processor | wc -l)
+echo "✨ augmenting data"
+${PY} ./python/augment.py
+echo "🖼 cropping augmented data"
+${PY} ./python/crop.py ./data/augmented/images
+echo "TODO: 🧠 train model"