Spaces:
Runtime error
Runtime error
File size: 3,299 Bytes
dc89ab8 9f5528c dc89ab8 fc32112 dc89ab8 fc32112 dc89ab8 fc32112 dc89ab8 9f5528c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import PQueue from "https://deno.land/x/p_queue@1.0.1/mod.ts"
import * as CSV from './csv.ts';
import Puppet from './puppet.ts';
import selectors from './selectors.ts';
// Shared Puppet instance; process() obtains its pages through puppet.run().
const puppet = new Puppet();

// Work queue configured for 10 concurrent tasks and a 60000 ms timeout.
const queue = new PQueue({
  concurrency: 10,
  timeout: 60000
})

// Number of tasks that have become active so far; only used in log output.
let count = 0

// Handle of the periodic stats logger that run() starts; undefined until then.
let statInterval: ReturnType<typeof setInterval> | undefined

/** Formats the queue counters that every progress log line reports. */
const queue_stats = (): string => `Size: ${queue.size} Pending: ${queue.pending}`

queue.addEventListener("active", () =>
  console.log(`Working on item #${++count}. ${queue_stats()}`))

queue.addEventListener("next", () =>
  console.log(`task finished, ${queue_stats()}`))

// Once the queue drains: stop the stats logger and shut the browser wrapper down.
queue.addEventListener("idle", async () => {
  clearInterval(statInterval)
  try {
    await puppet.close()
  } catch (e) {
    // A failed close must not surface as an unhandled rejection from an
    // event listener; log it like the other failures in this file.
    console.error(`couldn't close puppet: ${e}`)
  }
  console.log("all done")
})
/**
 * Finds every element matching `selector` on `page` and attaches its bounding
 * box to the element handle as a `box` property.
 *
 * The box is measured inside the page via `getBoundingClientRect()`, so `x`,
 * `y`, `width` and `height` are relative to the viewport.
 *
 * NOTE(review): `window.screen.top` / `window.screen.left` are non-standard
 * properties and may come back undefined depending on the browser — confirm
 * whether `window.screenTop` / `window.screenLeft` (or scroll offsets) were
 * intended. They are kept as-is to preserve the shape of `box`.
 *
 * @param page     Page object exposing `$$` and `evaluate`; its type is
 *                 declared outside this file's visible code, hence `any`.
 * @param selector Selector passed to `page.$$`.
 * @returns The matched handles, each carrying a `box`; an empty array when
 *          `page.$$` yields nothing.
 */
// deno-lint-ignore no-explicit-any
async function get_logos(page: any, selector: string): Promise<any[]> {
  const logos = (await page.$$(selector)) ?? [];
  // for..of visits only the array elements; for..in would also visit any
  // extra enumerable keys present on the array.
  for (const logo of logos) {
    logo.box = await page.evaluate((e: Element) => {
      const { x, y, width, height } = e.getBoundingClientRect();
      return {
        x, y, width, height, top: window.screen.top, left: window.screen.left
      }
    }, logo);
  }
  return logos;
}
/**
 * Captures everything recorded for one loaded page: a cropped screenshot per
 * usable logo candidate, an annotation file, and a full-page screenshot.
 *
 * Candidates come from get_logos() for the three selectors in `selectors`.
 * A candidate is skipped when it has no box, is narrower or shorter than
 * 10 px, or lies entirely above / left of the viewport origin.
 *
 * @param page Loaded page; its type is declared outside the visible code.
 * @param o    Entity being processed (`bco` names the output files).
 */
// deno-lint-ignore no-explicit-any
async function capture_logos(page: any, o: { bco: string, name: string }): Promise<void> {
  const imgs = await get_logos(page, selectors.img_logo);
  const ids = await get_logos(page, selectors.id_logo);
  const cls = await get_logos(page, selectors.class_logo);
  const logos = [...imgs, ...ids, ...cls];

  let annotations = '';
  for (const [i, logo] of logos.entries()) {
    const bb = logo.box
    if (!bb
      || (bb.width < 10)
      || (bb.height < 10)
      || (bb.x + bb.width < 0)
      || (bb.y + bb.height < 0)) continue;
    console.log('got bb', o.bco, bb)
    try {
      await logo.screenshot({ path: `./data/logos/${o.bco}.logo${i}.png` })
      // One line per saved logo: id, box centre x/y, width, height.
      annotations +=
        `${o.bco} ${bb.x + bb.width / 2} ${bb.y + bb.height / 2} ${bb.width} ${bb.height}\n`
    } catch (e) {
      // A single failed crop should not abort the remaining candidates.
      console.error(`couldn't screenshot logo: ${e}`);
    }
  }

  if (logos.length) {
    await Deno.writeTextFile(`./data/${o.bco}.chrome.full.txt`, annotations);
  }
  await page.screenshot({ path: `./data/${o.bco}.chrome.full.png`, fullPage: true })
  console.log(`screenshot ok for ${o.name}`);
}

/**
 * Opens the entity's site through `puppet.run`, and once the page fires
 * 'load', captures its logos and a full-page screenshot.
 *
 * The URL is first tried over https (a leading `http:` scheme is upgraded);
 * if that navigation fails the original URL is tried. When both attempts
 * fail and 'load' never fired, the task ends instead of waiting forever for
 * an event that will not arrive.
 *
 * @param o Entity record: `url` to visit, `bco` used in output file names,
 *          `name` used in log output.
 * @returns Whatever promise `puppet.run` returns for the page callback.
 */
function process(o: { url: string, bco: string, name: string }): Promise<void> {
  return puppet.run(async page => {
    // Anchor the match so only the scheme is rewritten, never an "http:"
    // that happens to appear later in the URL.
    const url = o.url.replace(/^http:/, 'https:');

    let captureStarted = false;
    const captured = new Promise<void>(resolve => {
      page.once('load', async () => {
        captureStarted = true;
        try {
          await capture_logos(page, o);
        } catch (err) {
          console.error(`error in screenshot: ${err}`);
        }
        resolve();
      })
    });

    let navigated = false;
    try {
      try {
        await page.goto(url)
      } catch {
        // https attempt failed: fall back to the URL exactly as given.
        await page.goto(o.url)
      }
      navigated = true;
    } catch (e) {
      console.error(`got error: ${e}`);
    }

    // Navigation failed and no capture is in flight: 'load' will never fire,
    // so awaiting `captured` would hang this task indefinitely.
    if (!navigated && !captureStarted) return;

    await captured;
  })
}
/**
 * Reads ./data/entidades.csv as text.
 * Logs the failure and yields undefined when the file cannot be read.
 */
async function read_entities_csv() {
  try {
    return await Deno.readTextFile("./data/entidades.csv")
  } catch (e) {
    console.error(`couldn't read csv: ${e}`)
    return undefined
  }
}

/**
 * Entry point: loads the entity CSV and queues one process() task per parsed
 * record, logging queue counters every second while work is outstanding.
 *
 * If the CSV could not be read or is empty, nothing is queued and run() is
 * scheduled again after one second.
 */
async function run() {
  const text = await read_entities_csv();

  if (text) {
    statInterval = setInterval(
      () => console.log(`Size: ${queue.size} Pending: ${queue.pending}`),
      1000,
    );
    CSV.parse(text, o => queue.add(() => process(o)))
    return;
  }

  // Nothing usable yet: try again in a second.
  return setTimeout(run, 1000)
}

run()
|