Niv Sardi committed on
Commit
fc32112
1 Parent(s): f7e5bce

add more heuristics to find logos, might be too much

Browse files

Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>

crawler/common/selectors.py CHANGED
@@ -1,6 +1,9 @@
1
  #!/usr/bin/env python3
2
 
3
- logo = "img[src*=logo]"
 
 
 
4
  logosbancos = "img[src*=logosbancos]"
5
 
6
  entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
 
1
  #!/usr/bin/env python3
2
 
3
+ img_logo = "img[src*=logo]"
4
+ id_logo = "*[id*=logo]"
5
+ cls_logo = "*[class*=logo]"
6
+
7
  logosbancos = "img[src*=logosbancos]"
8
 
9
  entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
crawler/imtool.py CHANGED
@@ -9,7 +9,7 @@ from typing import NamedTuple
9
  from entity import Entity
10
 
11
  TILE_SIZE = 800
12
- TILE_OVERLAP = 0.2
13
 
14
  class BoundingBox(NamedTuple):
15
  x: float = 0.0
 
9
  from entity import Entity
10
 
11
  TILE_SIZE = 800
12
+ TILE_OVERLAP = 0.8
13
 
14
  class BoundingBox(NamedTuple):
15
  x: float = 0.0
crawler/screenshot.py CHANGED
@@ -27,7 +27,9 @@ def sc_entity(e: Entity):
27
  driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
28
  driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
29
 
30
- logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
 
 
31
  with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
32
  for i in logos:
33
  f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
 
27
  driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
28
  driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
29
 
30
+ logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
31
+ logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
32
+ logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
33
  with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
34
  for i in logos:
35
  f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
src/index.ts CHANGED
@@ -22,6 +22,21 @@ queue.addEventListener("idle", async () => {
22
  console.log("all done")
23
  })
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  function process(o: { url: string, bco: string, name: string }): Promise<void> {
26
  const promises: Promise<void>[] = [];
27
 
@@ -30,11 +45,22 @@ function process(o: { url: string, bco: string, name: string }): Promise<void> {
30
  promises.push(new Promise<void>((accept, _reject) => {
31
  page.once('load', async () => {
32
  try {
33
- const logos = await page.$$(selectors.logo);
 
 
 
 
 
 
34
  let annotations = '';
35
  for (const i in logos) {
36
- const bb = await logos[i].boundingBox();
37
- if (!bb) continue;
 
 
 
 
 
38
 
39
  try {
40
  await logos[i].screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
 
22
  console.log("all done")
23
  })
24
 
25
+ async function get_logos(page, selector): {}[] {
26
+ const logos = await page.$$(selector) || [];
27
+ for (const i in logos) {
28
+ const bb = await page.evaluate(e => {
29
+ const { x, y, width, height } = e.getBoundingClientRect();
30
+ return {
31
+ x, y, width, height, top: window.screen.top, left: window.screen.left
32
+ }
33
+ }, logos[i])
34
+ logos[i].box = bb;
35
+ }
36
+ return logos;
37
+ }
38
+
39
+
40
  function process(o: { url: string, bco: string, name: string }): Promise<void> {
41
  const promises: Promise<void>[] = [];
42
 
 
45
  promises.push(new Promise<void>((accept, _reject) => {
46
  page.once('load', async () => {
47
  try {
48
+ const imgs = await get_logos(page, selectors.img_logo);
49
+ const ids = await get_logos(page, selectors.id_logo);
50
+ const cls = await get_logos(page, selectors.class_logo);
51
+ const logos = [
52
+ ...imgs, ...ids, ...cls
53
+ ]
54
+
55
  let annotations = '';
56
  for (const i in logos) {
57
+ const bb = logos[i].box
58
+ if (!bb
59
+ || (bb.width < 10)
60
+ || (bb.height < 10)
61
+ || (bb.x + bb.width < 0)
62
+ || (bb.y + bb.height < 0)) continue;
63
+ console.log('got bb', o.bco, bb)
64
 
65
  try {
66
  await logos[i].screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
src/selectors.ts CHANGED
@@ -1,5 +1,7 @@
1
  export default {
2
- "logo": "img[src*=logo]",
 
 
3
  "logosbancos": "img[src*=logosbancos]",
4
  "entity_http": "p.post-pagina-interior a[target=_blank][href*=http]",
5
  "entity_mailto": "p.post-pagina-interior a[target=_blank][href*=mailto]"
 
1
  export default {
2
+ "img_logo": "img[src*=logo]",
3
+ "id_logo": "*[id*=logo]",
4
+ "class_logo": "*[class*=logo]",
5
  "logosbancos": "img[src*=logosbancos]",
6
  "entity_http": "p.post-pagina-interior a[target=_blank][href*=http]",
7
  "entity_mailto": "p.post-pagina-interior a[target=_blank][href*=mailto]"