Spaces:
				
			
			
	
			
			
		Paused
		
	
	
	
			
			
	
	
	
	
		
		
		Paused
		
	| import { JSDOM, VirtualConsole } from "jsdom"; | |
/**
 * Collapses every run of whitespace (spaces, tabs, line breaks) into a single
 * space and trims both ends, so neighbouring words are never glued together.
 */
function collapseWhitespace(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Downloads `url` and returns its textual content.
 *
 * - `text/html`: the markup is loaded into a JSDOM document, the text of every
 *   `p`, `table`, `pre`, `ul` and `ol` element is joined with spaces, and
 *   whitespace runs are collapsed to single spaces.
 * - `text/plain` / `text/markdown`: the response body is returned unchanged.
 *
 * The request (including reading the body) is aborted after 10 seconds.
 * Cookies and other credentials are never sent.
 *
 * @param url Address of the page to download.
 * @returns The extracted text.
 * @throws Error if the HTML contains none of the selected elements, or if the
 *   response content type is not one of the supported types. Rejections from
 *   `fetch` itself (network failure, abort on timeout) propagate unchanged.
 */
export async function parseWeb(url: string): Promise<string> {
  const timeoutMs = 10000;
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), timeoutMs);

  try {
    const response = await fetch(url, {
      signal: abortController.signal,
      credentials: "omit",
    });
    // A missing header is treated as an unsupported type further down.
    const contentType = response.headers.get("content-type") ?? "";

    if (contentType.includes("text/html")) {
      const virtualConsole = new VirtualConsole();
      virtualConsole.on("error", () => {
        // No-op to skip console errors.
      });

      // Put the HTML string into a DOM.
      const dom = new JSDOM(await response.text(), { virtualConsole });
      const elements = dom.window.document.querySelectorAll("p, table, pre, ul, ol");
      if (!elements.length) {
        throw new Error(`webpage doesn't have any parseable element`);
      }

      const elementTexts = Array.from(elements, (element) => element.textContent ?? "");
      return collapseWhitespace(elementTexts.join(" "));
    }

    if (contentType.includes("text/plain") || contentType.includes("text/markdown")) {
      // Await inside the try block so the timeout still covers the body download.
      return await response.text();
    }

    throw new Error("Unsupported content type");
  } finally {
    // Always release the timer so it neither keeps the event loop alive nor
    // fires abort() on a request that has already settled.
    clearTimeout(abortTimer);
  }
}