File size: 1,014 Bytes
6434339
 
 
 
 
 
 
e943a05
6434339
 
 
 
 
 
 
 
 
 
 
ebac87f
 
 
 
 
 
 
6434339
ebac87f
 
6434339
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import { JSDOM, VirtualConsole } from "jsdom";

/**
 * Downloads a web page and returns the text of all its `<p>` elements joined
 * into one whitespace-normalized string.
 *
 * @param url - Address of the page to download.
 * @returns The paragraph texts joined with single spaces; every run of
 * whitespace (spaces, tabs, newlines) is collapsed to one space and the
 * result is trimmed.
 * @throws Whatever `fetch` / `response.text()` rejects with, including the
 * abort error raised when the download takes longer than 10 seconds.
 * @throws Error when the downloaded document contains no `<p>` element.
 */
export async function parseWeb(url: string): Promise<string> {
	const fetchTimeoutMs = 10000;
	const abortController = new AbortController();
	const timeoutId = setTimeout(() => abortController.abort(), fetchTimeoutMs);

	let htmlString: string;
	try {
		// The abort signal also covers reading the body, so the timer is only
		// cleared once the whole text has been received.
		const response = await fetch(url, { signal: abortController.signal });
		htmlString = await response.text();
	} finally {
		// Always disarm the timer so it does not linger after the request
		// has settled (successfully or not).
		clearTimeout(timeoutId);
	}

	const virtualConsole = new VirtualConsole();
	virtualConsole.on("error", () => {
		// No-op to skip console errors.
	});

	// put the html string into a DOM
	const dom = new JSDOM(htmlString, {
		virtualConsole,
	});

	const { document } = dom.window;
	const textElTags = "p";
	const paragraphs = document.querySelectorAll(textElTags);
	if (!paragraphs.length) {
		throw new Error(`webpage doesn't have any "${textElTags}" element`);
	}
	const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent ?? "");

	// Collapse whitespace runs to a single space instead of deleting them, so
	// words separated only by a newline or a double space stay separated.
	return paragraphTexts.join(" ").replace(/\s+/g, " ").trim();
}