Commit 1a24a58 by Niv Sardi
Parent(s): dd7a9e7

import python

Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>
- python/api.py +99 -0
- python/augment.py +64 -18
- python/common/defaults.py +1 -0
- python/common/selectors.py +2 -2
- python/crop.py +31 -10
- python/get_entities.py +26 -38
- python/httpd.py +24 -0
- python/imtool.py +33 -16
- python/main.py +53 -0
- python/markers.py +34 -11
- python/openfish.py +1 -0
- python/pipelines.py +1 -2
- python/screenshot.py +43 -0
- python/split.py +7 -1
- python/test.py +12 -0
- python/train.py +13 -0
- python/vendor.py +40 -13
- python/write_data.py +1 -0
python/api.py
ADDED
@@ -0,0 +1,99 @@
import os

from fastapi import FastAPI, WebSocket
from YOLOv6.yolov6.core.inferer import Inferer

import cv2

import yaml as YAML
import json
import csv

import ssl
import hashlib

from entity import read_entities
import imtool

app = FastAPI()

weights = './runs/train/exp27/weights/best_stop_aug_ckpt.pt'
device = 'cpu'
yaml = './data.yaml'
img_size = [640, 640]
half = False
conf_thres = 0.5
iou_thres = 0.45
classes = None
agnostic_nms = None
max_det = 1000
try:
    with open(yaml, 'r') as f:
        classes_data = YAML.safe_load(f.read())

    entities = read_entities('../data/entities.csv')

    certs = {}
    with os.scandir('../data/certs') as it:
        for entry in it:
            bco, ext = entry.name.split('.')
            if ext == 'cert':
                try:
                    cert_dict = ssl._ssl._test_decode_cert(entry.path)
                    with open(entry.path, 'r') as f:
                        cert_dict.update({
                            'fingerprint': hashlib.sha1(
                                ssl.PEM_cert_to_DER_cert(f.read())
                            ).hexdigest()
                        })
                except Exception as e:
                    print("Error decoding certificate: {:}".format(e))
                else:
                    name = entities[bco].name
                    certs.update({name: cert_dict})

    print(f'loaded {len(certs.keys())} certs, got {len(classes_data["names"])} classes')
    inferer = Inferer(weights, device, yaml, img_size, half)
except Exception as e:
    print('error', e)


@app.get("/")
async def root():
    return {"message": "API is working"}

@app.websocket("/ws")
async def websockets_cb(websocket: WebSocket):
    try:
        await websocket.accept()
        while True:
            data = await websocket.receive_text()
            img = imtool.read_base64(data)
            cv2.imwrite("debug.png", img)
            try:
                os.remove("debug.txt")
            except:
                pass

            inferer.load(img)
            ret = inferer.infer(conf_thres, iou_thres, classes, agnostic_nms, max_det)
            print(ret)
            await websocket.send_text(ret + '@@@@' + '[%d,%d,%d]' % img.shape)
    except Exception as e:
        print("got: ", e)

@app.websocket("/bgws")
async def send_classes(websocket: WebSocket):
    await websocket.accept()
    await websocket.send_text(json.dumps({
        'classes': classes_data,
        'certs': certs
    }))
    await websocket.close()

if __name__ == "__main__":
    import uvicorn
    config = uvicorn.Config("api:app", port=5000, log_level="info")
    server = uvicorn.Server(config)
    server.run()
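The /ws endpoint above expects a base64 data-URL frame and replies with the inference output, an '@@@@' separator, and the image shape. A minimal client sketch to exercise it, assuming the API is running locally on port 5000 and the third-party websockets package is installed (neither is part of this commit):

# test_ws.py - minimal sketch of a client for the /ws endpoint,
# assuming the API runs on localhost:5000 and `pip install websockets`.
import asyncio
import base64
import websockets

async def main():
    async with websockets.connect('ws://localhost:5000/ws') as ws:
        with open('debug.png', 'rb') as f:
            b64 = base64.b64encode(f.read()).decode()
        # imtool.read_base64 strips a 22-byte prefix, i.e. a
        # 'data:image/png;base64,' data-URL header.
        await ws.send('data:image/png;base64,' + b64)
        reply = await ws.recv()
        result, _, shape = reply.partition('@@@@')
        print('detections:', result, 'image shape:', shape)

asyncio.run(main())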
python/augment.py
CHANGED
@@ -25,6 +25,48 @@ import pipelines
 
 BATCH_SIZE = 16
 PARALLEL = 20
+MIN_BACKGROUND_SIZE = 500
+
+def process_bg(b):
+    imw = cv2.imread(b.path)
+    im, bb = imtool.remove_white(imw)
+    annot = None
+    label = b.path.replace('png', 'txt')
+    if os.path.exists(label):
+        # rewrite label with new coordinates
+        [ww, wh, _] = imw.shape
+        [iw, ih, _] = im.shape
+        es = imtool.read_centroids(label)
+        l = ''
+        for e in es:
+            [i, p, c] = e.values()
+            [x, y, w, h] = [
+                max((c.x*ww - bb.x)/iw, 0),
+                max((c.y*wh - bb.y)/ih, 0),
+                (c.w*ww)/iw,
+                (c.h*wh)/ih
+            ]
+
+            l += f'{int(i)} {x} {y} {w} {h}\n'
+        annot = l
+
+    if im.shape[0] > args.minbgsize and im.shape[1] > args.minbgsize:
+        return im, annot
+    else:
+        raise Exception(f'droping {b.path} after remove_white => {im.shape}')
+
+def filter_bgs(bgs):
+    ret = []
+    for b in bgs:
+        if b.path.endswith('txt'): continue
+        try:
+            img, annot = process_bg(b)
+        except Exception as e:
+            print(f'drop: {e}')
+            continue
+        ret.append((b, img, annot))
+    return ret
 
 def process(args):
     dest_images_path = os.path.join(args.dest, 'images')
@@ -40,9 +82,13 @@ def process(args):
     reader = csv.DictReader(f)
     db = {e.bco: e for e in [Entity.from_dict(d) for d in reader]}
 
-    background_images = [
-
+    background_images = []
+    for d in args.background:
+        background_images.extend(os.scandir(d))
 
+    print(f'filtering {len(background_images)} background images from {args.background}')
+    background_images = filter_bgs(background_images)
+    assert(len(background_images))
     stats = {
         'failed': 0,
         'ok': 0
@@ -69,7 +115,6 @@ def process(args):
     if img.ndim < 3:
         print(f'very bad dim: {img.ndim}')
 
-    img = imtool.remove_white(img)
     (h, w, c) = img.shape
 
     assert(w > 10)
@@ -95,8 +140,9 @@ def process(args):
     print(f'error loading: {d.path}: {e}')
 
     print(stats)
-
+
     assert(len(logo_alphas) == len(logo_images))
+    print(f"will process {len(logo_images)} images on {len(background_images)} backgrounds")
 
     # so that we don't get a lot of the same logos on the same page.
     zipped = list(zip(logo_images, logo_alphas))
@@ -117,7 +163,7 @@ def process(args):
 
     batches.append(UnnormalizedBatch(images=a,heatmaps=h))
 
-    bar = ChargingBar('augment', max=(len(batches)**2)/3*len(background_images))
+    bar = ChargingBar(f'augment ({len(logo_images)} logos {len(background_images)} bgs)', max=(len(batches)**2)/3*len(background_images))
     # We use a single, very fast augmenter here to show that batches
    # are only loaded once there is space again in the buffer.
     pipeline = pipelines.HUGE
@@ -137,16 +183,14 @@ def process(args):
     for i, batch_aug in enumerate(batches_aug):
         idx = list(range(len(batch_aug.images_aug)))
         random.shuffle(idx)
-        for j, d in enumerate(background_images):
+        for j, (d, img, annot) in enumerate(background_images):
+            basename = d.name.replace('.png', f'.{i}.{j}')
+            annotations = []
             try:
-
+                annotations.append(annot.rstrip())
             except:
-
-                next
+                pass
 
-            basename = d.name.replace('.png', '') + f'.{i}.{j}'
-
-            anotations = []
             for k in range(math.floor(len(batch_aug.images_aug)/3)):
                 bar.next()
                 logo_idx = (j+k*4)%len(batch_aug.images_aug)
@@ -165,7 +209,7 @@ def process(args):
                     bb = imtool.mix_alpha(img, logo, alpha[0],
                                           random.random(), random.random())
                     c = bb.to_centroid(img.shape)
-
+                    annotations.append(c.to_annotation(label))
                 except AssertionError as err:
                     print(f'couldnt process {i}, {j}: {err}')
                 except Exception as err:
@@ -175,7 +219,7 @@ def process(args):
             cv2.imwrite(f'{dest_images_path}/{basename}.png', img)
             label_path = f"{dest_labels_path}/{basename}.txt"
             with open(label_path, 'a') as f:
-                f.write('\n'.join(
+                f.write('\n'.join(annotations))
         except Exception:
             print(f'couldnt write image {basename}')
 
@@ -186,13 +230,14 @@ def process(args):
 
 if __name__ == '__main__':
     import argparse
-
+    print("✨ augmenting data")
     parser = argparse.ArgumentParser(description='mix backgrounds and logos into augmented data for YOLO')
     parser.add_argument('--logos', metavar='logos', type=str,
                         default=defaults.LOGOS_DATA_PATH,
                         help='dir containing logos')
-    parser.add_argument('--
-
+    parser.add_argument('--background', metavar='backgrounds', type=str,
+                        nargs='+',
+                        default=[defaults.SCREENSHOT_PATH, defaults.FISH_PATH],
                         help='dir containing background plates')
     parser.add_argument('--dst', dest='dest', type=str,
                         default=defaults.AUGMENTED_DATA_PATH,
@@ -200,6 +245,7 @@ if __name__ == '__main__':
     parser.add_argument('--parallel', metavar='parallel', type=int,
                         default=PARALLEL,
                         help='number of concurrent jobs')
-
+    parser.add_argument('--min-background-size', dest='minbgsize', type=int,
+                        default=MIN_BACKGROUND_SIZE, help='minimum background size')
     args = parser.parse_args()
     process(args)
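The heart of the new process_bg is the label rewrite: a centroid normalized to the original plate is re-normalized against the box that remove_white cropped out. A small sketch of that arithmetic with made-up numbers (all sizes hypothetical; note also that cv2 shapes are (rows, cols, channels), i.e. (height, width), so the [ww, wh, _] unpacking above may swap axes on non-square images):

# sketch: the coordinate rewrite in process_bg, with made-up numbers
W, H = 1920, 1080                    # original plate size
bx, by, bw, bh = 400, 200, 800, 600  # box returned by remove_white
cx, cy = 0.5, 0.5                    # centroid in the original, normalized

# shift into the crop, then re-normalize against the crop size
new_cx = max((cx * W - bx) / bw, 0)  # 0.7
new_cy = max((cy * H - by) / bh, 0)  # 0.5666...
print(new_cx, new_cy)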
python/common/defaults.py
CHANGED
@@ -18,6 +18,7 @@ SQUARES_IMAGES_PATH = D('SQUARES_IMAGES_PATH', f'{SQUARES_DATA_PATH}/images')
 
 DEBUG_PATH = D('DEBUG_PATH', f'{DATA_PATH}/debug')
 DEBUG_SQUARES_PATH = D('DEBUG_SQUARES_PATH', f'{DEBUG_PATH}/squares')
+LOG_PATH = D('LOG_PATH', f'{DATA_PATH}/logs')
 
 LOGOS_DATA_PATH = D('LOGOS_DATA_PATH', f'{DATA_PATH}/logos')
 
python/common/selectors.py
CHANGED
@@ -6,5 +6,5 @@ cls_logo = "*[class*=logo]"
 
 logosbancos = "img[src*=logosbancos]"
 
-entity_http = "p.post-pagina-interior
-entity_mailto = "p.post-pagina-interior
+entity_http = "p.post-pagina-interior [href*=http]"
+entity_mailto = "p.post-pagina-interior [href*=mailto]"
python/crop.py
CHANGED
@@ -1,26 +1,47 @@
 import os
 import argparse
 import imtool
+from progress.bar import ChargingBar
+import concurrent.futures
+
+PARALLEL = 30
+print("🖼 croping augmented data")
 
 parser = argparse.ArgumentParser(description='crop images to train YOLO on squares')
 parser.add_argument('src', metavar='dir', type=str, nargs='+',
                     help='dir containing the images')
 parser.add_argument('--dst', dest='dst', type=str, default='./data/squares',
                     help='dest dir')
+parser.add_argument('--parallel', metavar='parallel', type=int,
+                    default=PARALLEL,
+                    help='number of concurrent jobs')
 
 args = parser.parse_args()
 
+def process(e):
+    if e.name.endswith('.png') and e.is_file():
+        # print(e.name)
+        label = e.path.replace('images', 'labels').replace('.png', '.txt')
+        try:
+            id, boxes = imtool.read_centroids(label)
+            imtool.crop(id, e.path, boxes, args.dst)
+
+        except Exception as err:
+            print(err)
+
 for d in args.src:
-    i = 0
     with os.scandir(d) as it:
-
-        try:
-            i+=1
-            bco, boxes = imtool.read_centroids(label)
-            imtool.crop(bco, e.path, boxes, args.dst)
-
-        except Exception as err:
-            print(err)
+        with concurrent.futures.ThreadPoolExecutor(max_workers = args.parallel) as executor:
+            futures = {executor.submit(process, e): e for e in it}
+            count = len(futures.keys())
+            bar = ChargingBar('crop', max=count)
+
+            print('waiting for futures')
+            for f in concurrent.futures.as_completed(futures):
+                e = futures[f]
+                try:
+                    f.result()
+                except Exception as err:
+                    print(f'{a}({e}) generated an exception: {err}')
+                bar.next()
+            bar.finish()
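One caveat in the new error handler: the f-string interpolates {a}, which is never defined in crop.py, so the reporting line itself would raise a NameError the first time a job fails. It presumably should name the failed entry, e.g. (assumed intent, not the committed code):

# assumed fix for the undefined `a` in the handler above:
print(f'process({e.name}) generated an exception: {err}')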
python/get_entities.py
CHANGED
@@ -2,9 +2,11 @@
 import csv
 import requests
 import shutil
+import re
 
 from bs4 import BeautifulSoup
 from progress.bar import ChargingBar
+import concurrent.futures
 
 import web
 from entity import Entity
@@ -17,51 +19,37 @@ soup = BeautifulSoup(page.content, 'html.parser')
 options = soup.find(class_='form-control').find_all('option')
 mkdir.make_dirs([defaults.DATA_PATH, defaults.LOGOS_DATA_PATH])
 
-
+def get_links(soup):
+    for l in soup.select('.post-pagina-interior'):
+        for a in l.select('a'):
+            if 'href' in a.attrs and a.attrs['href'].startswith('http'):
+                return a.attrs['href']
+
+
 with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(Entity.row_names())
 
     bar = ChargingBar('get entities', max=len(options))
-
-
-
-
-
-
-
-
-            img = soup.select_one(selectors.logosbancos).attrs['src']
-            img = img.replace('../', 'https://www.bcra.gob.ar/')
-            fn = f"{defaults.LOGOS_DATA_PATH}/{bco}.0.png"
-            web.get_img_logo(img, fn)
-        except AttributeError as err:
-            print(f'couldnt extract image from {img}: {err}')
-            img = None
-
-        a = soup.select_one(selectors.entity_http)
-        try:
-            assert(a)
-            a = a.attrs['href']
-        except AttributeError:
-            a = soup.select_one(selectors.entity_mailto)
-            try:
-                a = 'http://' + a.attrs['href'].split('@')[1]
-
-            except TypeError:
-                print('ERROR', a)
-
-        e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
-        writer.writerow(e.to_row())
-
-        try:
-            get_bco()
-        except Exception as e:
-            print(f'Error processing: {o.url}')
-
+    def get_bco(o, i):
+        (name, bco)= (o.text, o.attrs['value'])
+
+        page = requests.post(URL, data={'bco': bco}, stream=False)
+        soup = BeautifulSoup(page.content, 'html.parser')
+        img = f'https://www.bcra.gob.ar/Imagenes/logosbancos/{bco}.jpg'
+        e = Entity(name, id=i, bco=bco, logo=str(img), url=str(get_links(soup)))
+        writer.writerow(e.to_row())
         i+=1
+    with concurrent.futures.ThreadPoolExecutor(max_workers = 20) as executor:
+        futures = {executor.submit(get_bco, o, i): o for (i, o) in enumerate(options[1:])}
+        for f in concurrent.futures.as_completed(futures):
+            o = futures[f]
+            try:
+                f.result()
+            except Exception as err:
+                print(f'({o}) generated an exception: {err}')
            bar.next()
    bar.finish()
 
 shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
-print(f'scrape finished, found {
+print(f'scrape finished, found {len(options[1:])} entities, dumped to {defaults.MAIN_CSV_PATH}')
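The rewrite fans get_bco out over a ThreadPoolExecutor while every worker shares the single csv.writer. csv.writer is not documented to be thread-safe, so interleaved rows are possible under load; a defensive variant (an assumption, not what this commit does) serializes writes with a lock:

# sketch: serialize concurrent writerow calls with a lock
import csv
import threading

write_lock = threading.Lock()

def safe_writerow(writer, row):
    # only one worker may touch the underlying file object at a time
    with write_lock:
        writer.writerow(row)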
python/httpd.py
ADDED
@@ -0,0 +1,24 @@
#!/usr/bin/python
import http.server
import ssl
import threading

def launch_httpd(httpd):
    print(f'launch {httpd.socket}')
    httpd.serve_forever()

def make_httpd(port):
    return http.server.HTTPServer(('0.0.0.0', port), http.server.SimpleHTTPRequestHandler)

[httpd, httpsd] = [make_httpd(p) for p in [8080, 8443]]

ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ctx.load_cert_chain('./cert.pem', keyfile='./privatekey.pem')
ctx.check_hostname = False

httpsd.socket = ctx.wrap_socket(sock=httpsd.socket, server_side=True)

for h in [httpd, httpsd]:
    t = threading.Thread(target=launch_httpd, args=(h,))
    t.start()
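httpd.py serves the working directory on both ports from one process. A quick smoke test, assuming requests is installed and the cert.pem/privatekey.pem pair is self-signed (hence verify=False):

# sketch: check both listeners answer
import requests

print(requests.get('http://localhost:8080/').status_code)                # expect 200
print(requests.get('https://localhost:8443/', verify=False).status_code) # expect 200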
python/imtool.py
CHANGED
@@ -3,6 +3,7 @@
 import os
 import math
 import cv2
+import base64
 import numpy as np
 from typing import NamedTuple, Tuple, List
 
@@ -32,6 +33,11 @@ class BoundingBox(NamedTuple):
         self = cls(x=d['x'], y=d['y'], w=d['width'], h=d['height'])
         return self
 
+    @classmethod
+    def from_arr(cls, a):
+        self = cls(*a)
+        return self
+
     @property
     def start(self):
         return floor_point(self.x, self.y)
@@ -86,25 +92,33 @@ class Centroid(BoundingBox):
                     , w=math.ceil(w*self.w)
                     , h=math.ceil(h*self.h))
 
-    def
+    def to_annotation(self, id: int):
         return f'{id} {self.x} {self.y} {self.w} {self.h}'
 
-    def
+def read_base64(data):
+    ib = base64.b64decode(data[22:])
+    arr = np.frombuffer(ib, dtype = np.uint8)
+    return cv2.imdecode(arr, flags=cv2.IMREAD_COLOR)
+
+def read_markers(filename: str, Type: type):
     ret = []
-    bco = None
     with open(filename, 'r') as f:
         lines = f.readlines()
         for l in lines:
-
-
-
-
-
-
-
+            try:
+                (b, x,y,w,h, p) = [float(i) for i in l.split(' ')]
+            except:
+                try:
+                    (b, x,y,w,h) = [float(i) for i in l.split(' ')]
+                except:
+                    continue
+                p = -1
+            ret.append({"class": b, "prob": p, "box": Type(x,y,w,h)})
+    assert(len(ret))
+    return ret
 
 def read_centroids(filename: str):
-    return
+    return read_markers(filename, Centroid)
 
 def coord_dict_to_point(c: dict):
     return coord_to_point(c['x'], c['y'], c['width'], c['height'])
@@ -138,10 +152,11 @@ def remove_white(img):
     gray = cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
     gray = 255*(gray<128)
     coords = cv2.findNonZero(gray)
-
-
+    # Find minimum spanning bounding box
+    bb = BoundingBox(*cv2.boundingRect(coords))
+    rect = img[bb.y:bb.y+bb.h, bb.x:bb.x+bb.w] # Crop the image - note we do this on the original image
 
-    return rect
+    return rect, bb
 
 
 def mix(a, b, fx, fy):
@@ -157,7 +172,7 @@ def mix_alpha(a, b, ba, fx, fy):
     if (aw*p < bw or ah*p < bh):
         f = min(p*aw/bw, p*ah/bh)
         nw, nh = floor_point(bw*f, bh*f)
-        #
+        #print(f'resizing to fit in {aw}x{ah}\t {bw}x{bh}\t=> {nw}x{nh}\tfactor {f}')
         r = cv2.resize(b, (nw, nh), interpolation = cv2.INTER_LINEAR)
         rba = cv2.resize(ba, (nw, nh), interpolation = cv2.INTER_LINEAR)
 
@@ -181,13 +196,15 @@ def _mix_alpha(a, b, ba, fx, fy):
     mask = np.dstack((ba, ba, ba))
 
     a[y:y+bh,x:x+bw] = mat * (1 - mask) + cols * mask
+    #a[y:y+bh,x:x+bw] = cols
 
     return BoundingBox(x, y, bw, bh)
 
-def crop(id, fn, logos: List[Centroid], out = './data/squares'
+def crop(id, fn, logos: List[Centroid], out = './data/squares'):
     basename = os.path.basename(fn).replace('.png', '')
     img_out = f"{out}/images"
     txt_out = f"{out}/labels"
+    debug_out = f"{defaults.DEBUG_PATH}/{out}"
     mkdir.make_dirs([debug_out, img_out, txt_out])
 
     im = cv2.imread(fn)
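The new read_base64 hard-codes data[22:], which is exactly the length of a 'data:image/png;base64,' header, so it only accepts PNG data URLs. A round-trip sketch, plus a variant that splits on the comma instead (an assumed alternative, not part of the commit):

# sketch: round-trip an image through a data URL
import base64
import cv2
import numpy as np

def read_data_url(data: str):
    # tolerate any MIME type by splitting at the comma instead of [22:]
    header, _, payload = data.partition(',')
    arr = np.frombuffer(base64.b64decode(payload), dtype=np.uint8)
    return cv2.imdecode(arr, flags=cv2.IMREAD_COLOR)

img = np.zeros((4, 4, 3), dtype=np.uint8)
ok, buf = cv2.imencode('.png', img)
url = 'data:image/png;base64,' + base64.b64encode(buf).decode()
assert read_data_url(url).shape == (4, 4, 3)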
python/main.py
ADDED
@@ -0,0 +1,53 @@
import csv
import requests
import shutil

from bs4 import BeautifulSoup
from progress.bar import ChargingBar

from entity import Entity
from common import selectors, defaults, mkdir

URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

options = soup.find(class_='form-control').find_all('option')
mkdir.make_dirs([defaults.DATA_PATH])

with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(Entity.row_names())

    i = 0
    bar = ChargingBar('Processing', max=len(options))
    for o in options[1:]:
        (name, bco)= (o.text, o.attrs['value'])
        page = requests.post(URL, data={'bco': bco})
        soup = BeautifulSoup(page.content, 'html.parser')
        try:
            img = soup.select_one(selectors.logosbancos).attrs['src']
            img = img.replace('../', 'https://www.bcra.gob.ar/')
        except AttributeError as err:
            print('img', name, err)
            img = None

        a = soup.select_one(selectors.entity_http)
        try:
            a = a.attrs['href']
        except AttributeError:
            a = soup.select_one(selectors.entity_mailto)
            try:
                a = 'http://' + a.attrs['href'].split('@')[1]

            except TypeError:
                print('ERROR', a)

        e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
        writer.writerow(e.to_row())
        i+=1
        bar.next()
    bar.finish()

shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
print('scrape finished')
python/markers.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 import cv2
 import argparse
 import imtool
@@ -7,15 +8,37 @@ parser.add_argument('pngs', metavar='img.png', type=str, nargs='+',
                     help='images to debug')
 args = parser.parse_args()
 
-
-
-    label = i.replace('images', 'labels').replace('.png', '.txt').replace('.jpg', '.txt')
-    bco, ccs = imtool.read_centroids(label)
-    bbs = [c.to_bounding_box(im.shape) for c in ccs]
-    for i,b in enumerate(bbs):
-        c = (100, 255*i/len(bbs), 255*(1 - i/len(bbs)))
-        cv2.rectangle(im, b.start, b.end, c, 5)
+if len(args.pngs) and os.path.isdir(args.pngs[0]):
+    args.pngs = [d.path for d in os.scandir(args.pngs[0])]
 
-
-
-
+def process():
+    for i in args.pngs:
+        if i.endswith('txt'): continue
+        im = cv2.imread(i)
+
+        try:
+            assert(im.shape)
+        except AttributeError:
+            print(f'couldnt parse {i}')
+            continue
+
+        label = i.replace('images', 'labels').replace('.png', '.txt').replace('.jpg', '.txt')
+        print(i)
+        try:
+            results = imtool.read_centroids(label)
+        except FileNotFoundError:
+            continue
+        except Exception as e:
+            print(f'error handeling {i}', e)
+            continue
+        bbs = [r["box"].to_bounding_box(im.shape) for r in results]
+        for i,b in enumerate(bbs):
+            print(b)
+            c = (100, 255*i/len(bbs), 255*(1 - i/len(bbs)))
+            cv2.rectangle(im, b.start, b.end, c, 5)
+
+        cv2.imshow('result', im)
+        cv2.waitKey(0)
+        cv2.destroyAllWindows()
+
+process()
python/openfish.py
CHANGED
@@ -43,6 +43,7 @@ def download_all(feed, n_workers=PARALLEL, dest=defaults.FISH_PATH):
 if __name__ == '__main__':
     import argparse
 
+    print("☠ getting extra backgrounds from OpenFish")
     parser = argparse.ArgumentParser(description='screenshot openfish open list')
     parser.add_argument('--parallel', metavar='parallel', type=int,
                         default=PARALLEL,
python/pipelines.py
CHANGED
@@ -13,8 +13,7 @@ sometimes = lambda aug: iaa.Sometimes(0.2, aug)
 HUGE = sometimes(iaa.Sequential(
     [
         # apply the following augmenters to most images
-        iaa.Fliplr(0.5), # horizontally flip 50% of all images
-        iaa.Flipud(0.2), # vertically flip 20% of all images
+        sometimes(iaa.Cartoon()),
         # crop images by -5% to 10% of their height/width
         sometimes(iaa.CropAndPad(
             percent=(-0.05, 0.1),
python/screenshot.py
ADDED
@@ -0,0 +1,43 @@
#!/usr/bin/env python3
#
import math

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from common import selectors
from entity import Entity
from common import defaults,mkdir

options = webdriver.FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--window-size=1920x8000")

def coord_to_point(c):
    x = math.floor(c['x'] + c['width']/2)
    y = math.floor(c['y'] + c['height']/2)
    return f"{x} {y} {math.ceil(c['width'])} {math.ceil(c['height'])}"

driver = webdriver.Firefox(options=options)
def sc_entity(e: Entity):
    print(f'screenshoting: {e}')
    mkdir.make_dirs([
        defaults.IMAGES_PATH,
        defaults.LABELS_PATH,
    ])

    driver.implicitly_wait(10)
    driver.get(e.url)
    #driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
    driver.save_full_page_screenshot(f"{defaults.IMAGES_PATH}/{e.bco}.full.png")

    logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
    logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
    logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
    with open(f"{defaults.LABELS_PATH}/{e.bco}.full.txt", 'w') as f:
        for i in logos:
            f.write(f"{e.id} {coord_to_point(i.rect)}\n")

if __name__ == '__main__':
    sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
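Two notes on the screenshotter: save_full_page_screenshot is a Firefox-only Selenium API, which is why the driver is built from FirefoxOptions; and coord_to_point writes absolute pixel values, while YOLO label files conventionally carry centroids normalized to [0, 1]. A normalizing variant (a hypothetical helper, assuming the downstream pipeline wants normalized coordinates):

# sketch: same centroid math as coord_to_point, divided by the page size
def coord_to_normalized(c, page_w, page_h):
    x = (c['x'] + c['width'] / 2) / page_w
    y = (c['y'] + c['height'] / 2) / page_h
    return f"{x} {y} {c['width'] / page_w} {c['height'] / page_h}"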
python/split.py
CHANGED
@@ -2,7 +2,6 @@
 import os
 import math
 from common import defaults, mkdir
-
 PATHS = {
     6: {
         'images': lambda dest, d: os.path.join(dest, 'images', d ),
@@ -16,6 +15,8 @@ PATHS = {
 
 if __name__ == '__main__':
     import argparse
+    print("✂ split dataset into train, val and test groups")
+
     parser = argparse.ArgumentParser(description='splits a yolo dataset between different data partitions')
     parser.add_argument('datapath', metavar='datapath', type=str,
                         help='csv file', default=defaults.SQUARES_DATA_PATH)
@@ -49,9 +50,14 @@ if __name__ == '__main__':
 
         mkdir.make_dirs([cpi, cpl])
         print( f'{d:6s} [ {p:6d}, {np:6d} ] ({np-p:6d}:{(np-p)/len(images):0.2f} )')
+
+        stats = {'images': 0, 'labels': 0}
         for si in images[p:np]:
+            stats['images'] += 1
             l = image_to_label(si.path)
             os.symlink(os.path.join(rpi, si.name), os.path.join(cpi, si.name))
             if l:
+                stats['labels'] +=1
                 nl = os.path.basename(l)
                 os.symlink(os.path.join(rpl, nl), os.path.join(cpl, nl))
+        print(stats)
python/test.py
ADDED
@@ -0,0 +1,12 @@
import re
import requests
import logging
from bs4 import BeautifulSoup

URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
page = requests.post(URL, data={'bco': '00331'}, stream=False)
soup = BeautifulSoup(page.content, 'html.parser')
for l in soup.select('.post-pagina-interior'):
    print(l)
    for a in l.select('a'):
        print(a)
python/train.py
ADDED
@@ -0,0 +1,13 @@
import yaml
from entities import read_entities

entities = read_entities()

with open(r'/content/yolov5/data.yaml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    labels_list = yaml.load(file, Loader=yaml.FullLoader)

label_names = labels_list['names']

print("Number of Classes are {}, whose labels are {} for this Object Detection project".format(num_classes,label_names))
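As committed, the final print references num_classes, which is never assigned, so the script dies with a NameError; it presumably should be derived from the label list:

# assumed fix: derive the count from the names read out of data.yaml
num_classes = len(label_names)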
python/vendor.py
CHANGED
@@ -1,17 +1,17 @@
 #!/usr/bin/env python3
-import csv
 import concurrent.futures
 import requests
 
 from progress.bar import ChargingBar
 
-from entity import Entity
+from entity import Entity, read_entities
 from common import defaults,mkdir
 import web
 
 PARALLEL = 20
 
 def do_screenshot(e: Entity):
+    assert(e.url)
     sfn = requests.post('http://puppet:8000/screenshot', json={
         'url': e.url,
         'id': e.id,
@@ -19,18 +19,33 @@ def do_screenshot(e: Entity):
         'logos': f'{defaults.LOGOS_DATA_PATH}/{e.bco}.png'
     })
 
-
+def get_entity_logo(e: Entity):
+    fn = f"{defaults.LOGOS_DATA_PATH}/{e.bco}.0.png"
+    web.get_img_logo(e.logo, fn)
 
-def from_csv(
-
-
-
+def from_csv(args):
+    ACTIONS = []
+    if (args.certs):
+        ACTIONS.append(web.get_cert)
+        mkdir.make_dirs([defaults.CERTS_PATH])
+    if (args.logos):
+        ACTIONS.append(web.get_logos)
+        mkdir.make_dirs([defaults.LOGOS_DATA_PATH])
+    if (args.screenshots):
+        ACTIONS.append(do_screenshot)
+        mkdir.make_dirs([defaults.SCREENSHOT_PATH])
+    if (args.entity_logo):
+        ACTIONS.append(get_entity_logo)
+        mkdir.make_dirs([defaults.LOGOS_DATA_PATH])
+
+    print(ACTIONS)
+    with concurrent.futures.ThreadPoolExecutor(max_workers = args.parallel) as executor:
         futures = {}
-        entities =
-
+        entities = read_entities(args.csv)
+        qs = len(entities.keys())*len(ACTIONS)
+        bar = ChargingBar(f'vendor ({qs} jobs)', max=qs)
 
-        for e in entities:
+        for e in entities.values():
             futures.update({executor.submit(f, e): (e, f) for f in ACTIONS})
         print('waiting for futures')
 
@@ -48,7 +63,7 @@ def from_csv(fn: str, n_workers = PARALLEL):
 
 if __name__ == '__main__':
     import argparse
-
+    print("🌏 getting vendor data")
     parser = argparse.ArgumentParser(description='extract certificates and screenshots websites')
     parser.add_argument('--csv', metavar='csv', type=str,
                         default=defaults.MAIN_CSV_PATH,
@@ -56,6 +71,18 @@ if __name__ == '__main__':
     parser.add_argument('--parallel', metavar='parallel', type=int,
                         default=PARALLEL,
                         help='number of concurrent jobs')
+    parser.add_argument('--logos', metavar='logos', type=bool,
+                        action=argparse.BooleanOptionalAction,
+                        default=True, help='try to get logos')
+    parser.add_argument('--entity-logo', metavar='entity_logo', type=bool,
+                        action=argparse.BooleanOptionalAction,
+                        default=True, help='try to get logos form ENTITY')
+    parser.add_argument('--certs', metavar='certs', type=bool,
+                        action=argparse.BooleanOptionalAction,
+                        default=True, help='try to get certs')
+    parser.add_argument('--screenshots', metavar='screenshots', type=bool,
+                        action=argparse.BooleanOptionalAction,
+                        default=True, help='try to get screenshots')
 
     args = parser.parse_args()
-    from_csv(args
+    from_csv(args)
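The four new flags rely on argparse.BooleanOptionalAction (Python 3.9+), which generates a paired --flag/--no-flag for each argument; the type=bool and metavar= keywords alongside it are redundant. A behavior sketch:

# sketch: how the BooleanOptionalAction flags above behave
import argparse

p = argparse.ArgumentParser()
p.add_argument('--certs', action=argparse.BooleanOptionalAction, default=True)
print(p.parse_args([]).certs)              # True (the default)
print(p.parse_args(['--no-certs']).certs)  # False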
python/write_data.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 import csv
 import entity
 import argparse