Niv Sardi committed on
Commit
dc89ab8
1 Parent(s): 68ee7bd

Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>

Dockerfile.deno ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Image for the Deno scraper: copies the sources, pre-caches their remote
# dependencies and runs the entry point.
FROM docker.io/denoland/deno
# MAINTAINER is deprecated; the LABEL form carries the same information.
LABEL maintainer="Niv Sardi <x@filtra.me>"
WORKDIR /app

COPY src-deno ./src
# Download and compile dependencies at build time.
RUN deno cache ./src/index.ts

# `deno` needs the `run` subcommand; the flags are the same ones listed in the
# commented-out command in docker-compose.yaml.
CMD ["deno", "run", "--allow-net", "--allow-env", "--allow-read", "--allow-write", "./src/index.ts"]
docker-compose.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
version: "3.9" # optional since v1.27.0
services:
  # Scraper container built from Dockerfile.deno.
  puppet:
    build:
      dockerfile: Dockerfile.deno
      context: .
    links:
      - browserless
    environment:
      BROWSERLESS_HOST: browserless
      BROWSERLESS_PORT: 3000
      DEBUG: "puppet"
    depends_on:
      - "browserless"
    # Keeps the container alive without running the scraper; the real command is kept in the trailing comment.
    command: "sh -c 'while echo deno; do sleep 3h; done'" #"deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
    volumes:
      - "./src-deno:/app/src:z"
      - "./data:/app/data:z"
    #restart: unless-stopped:600
    deploy:
      restart_policy:
        condition: any
        delay: 600s
        window: 300s

  # Headless Chromium exposing the remote debugging endpoint on port 3000.
  browserless:
    image: docker.io/zenika/alpine-chrome
    # Restart Chromium whenever it exits.
    entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]
    # The service key is `ports` (plural); `port` is not a valid Compose key.
    ports:
      - "3000:3000"
31
+
src/csv.test.ts ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ assertEquals,
3
+ assertObjectMatch
4
+ } from "https://deno.land/std@0.152.0/testing/asserts.ts";
5
+ import * as CSV from './csv.ts';
6
+
7
// parseLine must strip the surrounding quotes and keep a comma that sits
// inside a quoted field.
Deno.test("ParseLine", () => {
  const fields = CSV.parseLine('"test", "test, with", without');
  assertEquals(fields, ['test', 'test, with', 'without']);
});
10
// parse must hand the callback one object per data row, keyed by the header.
Deno.test("ParseCSV", () => {
  const expected = { test: 'hello', case: 'world' };
  const rows: object[] = [];

  CSV.parse('test,case\nhello,world', (row) => {
    rows.push(row);
  });

  assertObjectMatch(rows[0], expected);
  assertEquals(rows, [expected]);
});
src/csv.ts ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Splits one CSV line into its fields.
 *
 * - Fields are separated by commas; empty fields are kept as `''` so that
 *   columns never shift.
 * - Whitespace before a field is dropped; whitespace after an unquoted field
 *   is kept.
 * - A field that starts with `"` runs to the matching closing quote and may
 *   contain commas; a doubled quote (`""`) inside it stands for one quote.
 *   Anything between the closing quote and the next comma is ignored.
 * - Trailing `\r` / `\n` characters are removed before parsing.
 *
 * @param l the raw line
 * @returns the field values, or an empty array for an empty line
 */
export function parseLine(l: string): string[] {
  const line = l.replace(/[\r\n]+$/, '');
  const fields: string[] = [];
  if (line.length === 0) {
    return fields;
  }

  const whitespace = /\s/;
  let pos = 0;
  // `<=` so that a trailing comma yields a final empty field.
  while (pos <= line.length) {
    while (pos < line.length && whitespace.test(line.charAt(pos))) {
      pos++;
    }

    let field = '';
    if (line.charAt(pos) === '"') {
      pos++; // skip the opening quote
      while (pos < line.length) {
        const ch = line.charAt(pos);
        if (ch === '"') {
          if (line.charAt(pos + 1) === '"') {
            // escaped quote inside a quoted field
            field += '"';
            pos += 2;
            continue;
          }
          pos++; // skip the closing quote
          break;
        }
        field += ch;
        pos++;
      }
      // ignore anything between the closing quote and the separator
      while (pos < line.length && line.charAt(pos) !== ',') {
        pos++;
      }
    } else {
      const comma = line.indexOf(',', pos);
      const end = comma === -1 ? line.length : comma;
      field = line.slice(pos, end);
      pos = end;
    }

    fields.push(field);
    pos++; // step over the separator
  }
  return fields;
}
8
+
9
/**
 * Parses CSV text whose first line is a header and calls `cb` once per data
 * row with an object mapping each header name to that row's value.
 *
 * Lines are split on `\n` or `\r\n`; blank (empty or whitespace-only) lines
 * are skipped. Extra fields beyond the header length are ignored.
 *
 * @param t the full CSV text
 * @param cb receives one object per data row, in file order
 * @returns `null` when a row has fewer fields than the header (the error is
 *   logged and parsing stops at that row), otherwise `undefined`
 */
export function parse(t: string, cb: (o: object) => void): null | undefined {
  const lines = t.split(/\r?\n/);
  const header = parseLine(lines[0] ?? '');

  for (const line of lines.slice(1)) {
    if (!line.trim().length) {
      continue;
    }
    const fields = parseLine(line);

    if (fields.length < header.length) {
      console.error(`couldn't parse '${line}' yielded '${fields}' of length ${fields.length} expected ${header.length}: ${header}`);
      return null;
    }

    const row: Record<string, string> = {};
    header.forEach((key, j) => {
      // fields has at least header.length entries here, so the fallback is never used
      row[key] = fields[j] ?? '';
    });
    cb(row);
  }
  return undefined;
}
src/img.ts ADDED
@@ -0,0 +1 @@
 
 
1
+ import * as opencv from "https://deno.land/x/opencv@v4.3.0-10/mod.ts";
src/index.ts ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PQueue from "https://deno.land/x/p_queue@1.0.1/mod.ts"
2
+
3
+ import * as CSV from './csv.ts';
4
+ import Puppet from './puppet.ts';
5
+ import selectors from './selectors.ts';
6
+
7
// One shared browser runner, plus a queue that runs at most 10 tasks at a
// time with a 60000 ms per-task timeout.
const puppet = new Puppet();
const queue = new PQueue({ concurrency: 10, timeout: 60000 });

// Number of tasks started so far; only used for logging.
let count = 0;

queue.addEventListener("active", () => {
  count += 1;
  console.log(`Working on item #${count}. Size: ${queue.size} Pending: ${queue.pending}`);
});

queue.addEventListener("next", () => {
  console.log(`task finished, Size: ${queue.size} Pending: ${queue.pending}`);
});

// Print queue statistics once per second until the queue goes idle.
const statInterval = setInterval(() => {
  console.log(`Size: ${queue.size} Pending: ${queue.pending}`);
}, 1000);

// When the queue goes idle: stop the stats timer and close the browser.
queue.addEventListener("idle", async () => {
  clearInterval(statInterval);
  await puppet.close();
  console.log("all done");
});
25
+
26
/**
 * Opens the entity's site in a fresh page and, once the page has loaded,
 * saves a screenshot of every element matching `selectors.logo`, an
 * annotation file and a full-page screenshot under `./data`.
 *
 * Each annotation line is `<bco> <centerX> <centerY> <width> <height>`,
 * computed from the logo's bounding box.
 *
 * The `https:` variant of the URL is tried first, then the URL as given.
 * When both navigations fail the task ends immediately instead of waiting
 * for a `load` event that may never arrive.
 *
 * @param o entity row: `url` to visit, `bco` used in file names, `name` used
 *   in log messages
 */
function process(o: { url: string, bco: string, name: string }): Promise<void> {
  return puppet.run(async page => {
    const url = o.url.replace('http:', 'https:');

    // Register the handler before navigating so the load event is not missed.
    const captured = new Promise<void>(accept => {
      page.once('load', async () => {
        try {
          const logos = await page.$$(selectors.logo);
          let annotations = '';
          for (const [i, logo] of logos.entries()) {
            const bb = await logo.boundingBox();
            if (!bb) continue;

            try {
              await logo.screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
              annotations +=
                `${o.bco} ${bb.x + bb.width / 2} ${bb.y + bb.height / 2} ${bb.width} ${bb.height}\n`
            } catch (e) {
              console.error(`couldn't screenshot logo: ${e}`);
            }
          }
          if (logos.length) {
            await Deno.writeTextFile(`./data/${o.bco}.chrome.full.txt`, annotations);
          }
          await page.screenshot({ path: `./data/${o.bco}.chrome.full.png`, fullPage: true })
          console.log(`screenshot ok for ${o.name}`);
        } catch (err) {
          console.error(`error in screenshot: ${err}`);
        }
        // Always settle, even when capturing failed, so the task can finish.
        accept()
      })
    });

    try {
      await page.goto(url)
        .catch(() => page.goto(o.url))
    } catch (e) {
      console.error(`got error: ${e}`);
      // Navigation failed both ways: do not wait for the capture.
      return;
    }
    await captured;
  })
}
69
+
70
// Read the entity list and queue one scraping task per usable row.
const text = await Deno.readTextFile("./data/entidades.csv");
CSV.parse(text, row => {
  // CSV.parse only promises an `object`; check the columns process() needs.
  const { url, bco, name } = row as Record<string, unknown>;
  if (typeof url !== 'string' || typeof bco !== 'string' || typeof name !== 'string') {
    console.error(`skipping row without url/bco/name columns: ${JSON.stringify(row)}`);
    return;
  }
  queue.add(() => process({ url, bco, name }));
})
src/puppet.test.ts ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import { assertEquals } from "https://deno.land/std@0.152.0/testing/asserts.ts";
2
+ import Puppet from './puppet.ts'
3
+
4
// Smoke test: connect, open one page, then close the browser.
Deno.test("Puppet", async () => {
  const runner = new Puppet();
  await runner.connect();
  await runner.run((page) => page.goto("https://google.com"));
  await runner.close();
});
src/puppet.ts ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Puppeteer from "https://deno.land/x/puppeteer@14.1.1/mod.ts";
2
+ import EventEmitter from "https://deno.land/x/events@v1.0.0/mod.ts";
3
+ import type { Browser, Page } from "https://deno.land/x/puppeteer@14.1.1/mod.ts";
4
+
5
// Browser events that Runner re-emits under a `browser:` prefix.
const BROWSER_SIGNALS = ['disconnected', 'targetchanged', 'targetcreated', 'targetdestroyed'];

// Command-line flags passed to Chrome when a local instance is launched.
const CHROME_ARGS = ['--no-sandbox', '--disable-setuid-sandbox'];
16
+
17
// Dotted-quad IPv4 literal, anchored so host names containing digits do not match.
const IPV4_PATTERN = /^(?:\d{1,3}\.){3}\d{1,3}$/;

/**
 * Turns a host name into an IPv4 address string.
 *
 * @param a a host name or an IPv4 literal
 * @returns `a` unchanged when it is already an IPv4 literal; otherwise the
 *   first `A` record found for it. When the lookup fails or returns no
 *   records, `a` itself is returned so the caller can still try the name.
 */
async function resolve(a: string): Promise<string> {
  if (IPV4_PATTERN.test(a)) {
    return a;
  }
  try {
    // resolveDns yields every A record; only one can go into a URL.
    const records = await Deno.resolveDns(a, "A");
    return records[0] ?? a;
  } catch (err) {
    console.error(`couldn't resolve '${a}': ${err}`);
    return a;
  }
}
23
+
24
/**
 * Owns one Puppeteer browser and runs callbacks against fresh pages in it.
 *
 * The browser is reached through the DevTools endpoint of the configured
 * host/port. Selected browser events are re-emitted on this object as
 * `browser:<event>`, and `ready` is emitted once the connection is set up.
 */
export default class Runner extends EventEmitter {
  config: {
    BROWSERLESS_HOST: string;
    BROWSERLESS_PORT: string;
  };
  /** WebSocket endpoint of the browser; replaced by the advertised one on connect. */
  target: string;
  browser: Browser | undefined;
  /** Cached connection attempt, so concurrent callers share one attempt. */
  connected: Promise<boolean> | undefined;

  /**
   * @param config host and port of the browser; defaults to the
   *   `BROWSERLESS_HOST` / `BROWSERLESS_PORT` environment variables, falling
   *   back to `localhost` and `3000`
   */
  constructor(config = {
    BROWSERLESS_HOST: Deno.env.get("BROWSERLESS_HOST") || "localhost",
    BROWSERLESS_PORT: Deno.env.get("BROWSERLESS_PORT") || "3000",
  }) {
    super();
    this.target = `ws://${config.BROWSERLESS_HOST}:${config.BROWSERLESS_PORT}`;
    this.config = config;
  }

  /** Closes the browser if one was opened; errors are logged, not thrown. */
  public async close() {
    try {
      if (this.browser) await this.browser.close();
    } catch (err) {
      console.error(`${err} on close`)
    }
  }

  /**
   * Connects on first use and returns the same promise on later calls.
   *
   * @returns resolves to `true` when the browser is ready, `false` otherwise
   */
  async connect() {
    if (!this.connected)
      this.connected = this._connect()
    return this.connected
  }

  /**
   * Looks up the browser's WebSocket endpoint via `/json/version`, connects to
   * it (launching a local, non-headless Chrome if that connection fails),
   * wires up event forwarding and closes any pages that are already open.
   *
   * @returns `true` on success, `false` when any step failed (the error is logged)
   */
  async _connect() {
    try {
      const host = await resolve(this.config.BROWSERLESS_HOST);
      const res = await fetch(`http://${host}:${this.config.BROWSERLESS_PORT}/json/version`);
      const ver = await res.json();
      this.target = ver.webSocketDebuggerUrl;
      this.browser = this.browser || await Puppeteer.connect({
        browserWSEndpoint: this.target
      }).catch(() => {
        console.error(`
⚠ COULD NOT CONNECT TO BROWSERLESS
🦄 will try to spawn a chromedriver instance for you to debug`)
        return Puppeteer.launch({
          args: CHROME_ARGS,
          headless: false
        })
      });

      const browser = this.browser;
      if (!browser) {
        console.error("couldn't init Browser");
        return false;
      }
      for (const signal of BROWSER_SIGNALS) {
        browser.on(signal, d => this.emit(`browser:${signal}`, d));
      }
      browser.on('error', e => console.error(`got browser error: ${e}`))

      // Start from a clean slate: close pages left open in the browser.
      const pages = await browser.pages();
      for (const page of pages) {
        await page.close();
      }
      this.emit("ready")
      return true;
    } catch (e) {
      console.error(e);
      return false;
    }
  }

  /**
   * Opens a new page, passes it to `fn`, and closes the page afterwards,
   * whether or not `fn` succeeded.
   *
   * Errors thrown by `fn` or by the browser are logged and not rethrown.
   *
   * @param fn callback that receives the fresh page; its result is awaited
   * @returns whatever `fn` resolved to, or `undefined` when no browser is
   *   available or an error occurred
   */
  public async run(fn: (page: Page) => void) {
    await this.connect();

    if (!this.browser) {
      return;
    }
    let page: Page | undefined;
    try {
      page = await this.browser.newPage()
      if (!page) {
        return;
      }
      return await fn(page)
    } catch (e) {
      console.error(`error while running page task: ${e}`);
      return
    } finally {
      if (page) {
        await page.close()
          .catch(err => console.error(`couldn't close page: ${err}`));
      }
    }
  }
}
src/selectors.ts ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
// CSS selectors used by the scraper, keyed by what they select.
const selectors = {
  logo: "img[src*=logo]",
  logosbancos: "img[src*=logosbancos]",
  entity_http: "p.post-pagina-interior a[target=_blank][href*=http]",
  entity_mailto: "p.post-pagina-interior a[target=_blank][href*=mailto]",
};

export default selectors;