File size: 1,266 Bytes
6434339
 
 
 
 
3f5871c
6434339
3f5871c
 
 
 
 
6434339
3f5871c
 
 
 
6434339
3f5871c
 
 
 
 
 
 
6434339
3f5871c
 
6434339
3f5871c
 
 
 
 
 
 
 
 
6434339
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import { JSDOM, VirtualConsole } from "jsdom";

/**
 * Collapses every run of whitespace (spaces, tabs, line breaks) into a single
 * space and trims both ends, so words that were separated only by a line break
 * do not get glued together.
 */
function collapseWhitespace(text: string): string {
	return text.replace(/\s+/g, " ").trim();
}

/**
 * Downloads `url` and returns its readable text.
 *
 * - `text/html`: the text content of every `p`, `table`, `pre`, `ul` and `ol`
 *   element is joined with spaces and whitespace-normalized.
 * - `text/plain` / `text/markdown`: the body is returned unchanged.
 *
 * The request, including the body download, is aborted after 10 seconds.
 *
 * @param url - Address of the document to fetch.
 * @returns The extracted text.
 * @throws If the request fails or is aborted, the response status is not 2xx,
 * the HTML contains none of the selected elements, or the content type is not
 * one of the supported ones.
 */
export async function parseWeb(url: string): Promise<string> {
	const abortController = new AbortController();
	const timeout = setTimeout(() => abortController.abort(), 10000);

	try {
		const r = await fetch(url, { signal: abortController.signal, credentials: "omit" });

		// An error page is not the content the caller asked for; fail instead of parsing it.
		if (!r.ok) {
			throw new Error(`Failed to fetch webpage: ${r.status} ${r.statusText}`);
		}

		const contentType = r.headers.get("content-type") ?? "";

		if (contentType.includes("text/html")) {
			const virtualConsole = new VirtualConsole();
			virtualConsole.on("error", () => {
				// No-op to skip console errors.
			});

			// put the html string into a DOM
			const dom = new JSDOM(await r.text(), {
				virtualConsole,
			});

			const { document } = dom.window;
			const paragraphs = document.querySelectorAll("p, table, pre, ul, ol");

			if (!paragraphs.length) {
				throw new Error(`webpage doesn't have any parseable element`);
			}
			const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);

			// combine text contents from paragraphs, then squeeze newlines and repeated spaces
			return collapseWhitespace(paragraphTexts.join(" "));
		}

		if (contentType.includes("text/plain") || contentType.includes("text/markdown")) {
			// `await` keeps the body download inside the try block, so the timeout still covers it.
			return await r.text();
		}

		throw new Error("Unsupported content type");
	} finally {
		// Without this the timer stays pending (and keeps the event loop alive) after every call.
		clearTimeout(timeout);
	}
}