File size: 1,660 Bytes
faca43f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import { JSDOM, VirtualConsole } from "jsdom";

function removeTags(node: Node) {
	if (node.hasChildNodes()) {
		node.childNodes.forEach((childNode) => {
			if (node.nodeName === "SCRIPT" || node.nodeName === "STYLE") {
				node.removeChild(childNode);
			} else {
				removeTags(childNode);
			}
		});
	}
}
function naiveInnerText(node: Node): string {
	const Node = node; // We need Node(DOM's Node) for the constants, but Node doesn't exist in the nodejs global space, and any Node instance references the constants through the prototype chain
	return [...node.childNodes]
		.map((childNode) => {
			switch (childNode.nodeType) {
				case Node.TEXT_NODE:
					return node.textContent;
				case Node.ELEMENT_NODE:
					return naiveInnerText(childNode);
				default:
					return "";
			}
		})
		.join("\n");
}

export async function parseWeb(url: string) {
	const abortController = new AbortController();
	setTimeout(() => abortController.abort(), 10000);
	const htmlString = await fetch(url, { signal: abortController.signal })
		.then((response) => response.text())
		.catch((err) => console.log(err));

	const virtualConsole = new VirtualConsole();
	virtualConsole.on("error", () => {
		// No-op to skip console errors.
	});

	// put the html string into a DOM
	const dom = new JSDOM(htmlString ?? "", {
		virtualConsole,
	});

	const body = dom.window.document.querySelector("body");
	if (!body) throw new Error("body of the webpage is null");

	removeTags(body);

	// recursively extract text content from the body and then remove newlines and multiple spaces
	const text = (naiveInnerText(body) ?? "").replace(/ {2}|\r\n|\n|\r/gm, "");

	return text;
}