import { Readability } from '@mozilla/readability';
import { XMLParser } from 'fast-xml-parser';
import { parseHTML } from 'linkedom';
import { Result, err, ok } from 'neverthrow';
import { z } from 'zod';
const rssFeedSchema = z.object({
  title: z.string().min(1),
  link: z.string(),
  // `id` is extracted below as well; declare it here so validation does not silently strip it
  id: z.string().min(1),
  pubDate: z.date().nullable(),
});
function cleanString(text: string) {
  return text
    .replace(/[ \t]+/g, ' ') // collapse runs of spaces/tabs
    .replace(/\n[ \t]+/g, '\n') // drop spaces/tabs after newlines
    .replace(/[ \t]+\n/g, '\n') // drop spaces/tabs before newlines
    .replace(/\n{3,}/g, '\n\n') // keep at most 2 consecutive newlines
    .trim(); // trim leading/trailing whitespace
}
function cleanUrl(url: string) {
  let u: URL;
  try {
    u = new URL(url); // feeds may give relative links or the 'UNKNOWN' placeholder below; don't let this throw
  } catch {
    return url;
  }
  const paramsToRemove = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid'];
  for (const param of paramsToRemove) {
    u.searchParams.delete(param);
  }
  return u.toString();
}
/**
 * Parses RSS/XML feed content to extract article information
 *
 * Handles various RSS feed formats and structures while normalizing the output.
 * Extracts titles, links, and publication dates from the feed items.
 *
 * @param xml The XML content of the RSS feed as a string
 * @returns A Result containing either an array of parsed feed items or an error
 */
export async function parseRSSFeed(xml: string): Promise<Result<z.infer<typeof rssFeedSchema>[], Error>> {
  const safeParser = Result.fromThrowable(
    (xml: string) => new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_' }).parse(xml),
    e => (e instanceof Error ? e : new Error(String(e)))
  );
  const parsedXml = safeParser(xml);
  if (parsedXml.isErr()) {
    return err(new Error(`Parse error: ${parsedXml.error.message}`));
  }
  const result = parsedXml.value;
  // handle various feed structures
  let items = result.rss?.channel?.item || result.feed?.entry || result.item || result['rdf:RDF']?.item || [];
  // handle single item case
  items = Array.isArray(items) ? items : [items];
  // biome-ignore lint/suspicious/noExplicitAny: feed item shape varies across RSS/Atom/RDF formats
  const properItems = items.map((item: any) => {
    let title = '';
    let link = '';
    let id = '';
    let pubDateString: string | null = null;
    if (typeof item.title === 'string') {
      title = item.title;
    } else if (typeof item.title === 'object' && item.title['#text']) {
      title = item.title['#text'];
    } else {
      title = 'UNKNOWN';
    }
    if (typeof item.link === 'string') {
      link = item.link;
    } else if (typeof item.link === 'object' && item.link['@_href']) {
      link = item.link['@_href'];
    } else if (typeof item.guid === 'string') {
      link = item.guid;
    } else {
      link = 'UNKNOWN';
    }
    if (typeof item.guid === 'string') {
      id = item.guid;
    } else if (typeof item.guid === 'object' && item.guid['#text']) {
      id = item.guid['#text'];
    } else {
      id = 'UNKNOWN';
    }
    if (typeof item.pubDate === 'string') {
      pubDateString = item.pubDate;
    } else if (typeof item.published === 'string') {
      pubDateString = item.published;
    } else if (typeof item.updated === 'string') {
      pubDateString = item.updated;
    }
    let pubDate: Date | null = null;
    if (pubDateString) {
      pubDate = new Date(pubDateString);
      if (Number.isNaN(pubDate.getTime())) {
        pubDate = null;
      }
    }
    return {
      title: cleanString(title),
      link: cleanUrl(cleanString(link)),
      id: cleanString(id),
      pubDate,
    };
  });
  // standardize the items
  const parsedItems = z.array(rssFeedSchema).safeParse(properItems);
  if (parsedItems.success === false) {
    return err(new Error(`Validation error: ${parsedItems.error.message}`));
  }
  return ok(parsedItems.data);
}
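// Usage sketch for parseRSSFeed (not part of the original module; the feed URL is a placeholder):
//
//   const res = await fetch('https://example.com/feed.xml');
//   const feed = await parseRSSFeed(await res.text());
//   if (feed.isErr()) {
//     console.error(feed.error.message);
//   } else {
//     for (const item of feed.value) {
//       console.log(item.title, item.link, item.pubDate?.toISOString() ?? 'no date');
//     }
//   }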
/**
 * Parses HTML content to extract article text and metadata
 *
 * Uses Mozilla Readability to identify and extract the main content
 * from an HTML document, ignoring navigation, ads, and other non-content elements.
 *
 * @param opts Object containing the HTML content to parse
 * @returns A Result containing either the parsed article data or an error object
 */
export function parseArticle(opts: { html: string }) {
  const safeReadability = Result.fromThrowable(
    (html: string) => new Readability(parseHTML(html).document).parse(),
    e => (e instanceof Error ? e : new Error(String(e)))
  );
  const articleResult = safeReadability(opts.html);
  if (articleResult.isErr()) {
    return err({ type: 'READABILITY_ERROR', error: articleResult.error });
  }
  // if we can't parse the article or there is no article, not much we can do
  const article = articleResult.value;
  if (article === null || !article.title || !article.textContent) {
    return err({ type: 'NO_ARTICLE_FOUND', error: new Error('No article found') });
  }
  return ok({
    title: article.title,
    text: cleanString(article.textContent),
    publishedTime: article.publishedTime || undefined,
  });
}
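// Usage sketch for parseArticle (not part of the original module; the article URL is a placeholder):
//
//   const html = await (await fetch('https://example.com/some-article')).text();
//   const article = parseArticle({ html });
//   if (article.isOk()) {
//     console.log(article.value.title, article.value.text.slice(0, 200));
//   } else {
//     console.error(article.error.type, article.error.error.message);
//   }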