Spaces:

bonesmasher
/

web_reader

Build error

App Files Files Community

web_reader / src /services /lm.ts

nomagick

fix: remove readerlm automatic retry

bc8dea9 unverified about 1 year ago

raw

history blame contribute delete

5.16 kB

	import { AsyncService } from 'civkit/async-service';
	import { singleton } from 'tsyringe';

	import { PageSnapshot } from './puppeteer';
	import { GlobalLogger } from './logger';
	import _ from 'lodash';
	import { AssertionFailureError } from 'civkit';
	import { LLMManager } from '../shared/services/common-llm';
	import { JSDomControl } from './jsdom';

	// Markdown code-fence delimiter. Kept in a constant so the prompt template
	// literals in LmControl can interpolate it instead of embedding raw backticks,
	// which would otherwise have to be escaped inside a backtick-delimited string.
	const tripleBackTick = '```';

	/**
	 * Singleton service that feeds a page snapshot to a language model and streams
	 * the model output back as progressively more complete `PageSnapshot` objects.
	 *
	 * Every public generator yields one snapshot per streamed chunk; each yielded
	 * snapshot is a shallow copy of the input whose `parsed.textContent` holds all
	 * text received so far.
	 */
	@singleton()
	export class LmControl extends AsyncService {

	    logger = this.globalLogger.child({ service: this.constructor.name });

	    constructor(
	        protected globalLogger: GlobalLogger,
	        protected commonLLM: LLMManager,
	        protected jsdomControl: JSDomControl,
	    ) {
	        super(...arguments);
	    }

	    /** Waits for `dependencyReady()` and then emits the `ready` event. */
	    override async init() {
	        await this.dependencyReady();

	        this.emit('ready');
	    }

	    /**
	     * Passes the snapshot HTML through `jsdomControl.cleanHTMLforLMs` using the
	     * selector list shared by every model call in this service.
	     * NOTE(review): the selector list presumably names the elements to strip —
	     * confirm against JSDomControl.
	     */
	    protected cleanHtmlForLM(snapshot: PageSnapshot) {
	        return this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');
	    }

	    /**
	     * Starts a streaming `readerlm-v2` run for the given prompt with the sampling
	     * parameters shared by both ReaderLM entry points.
	     */
	    protected runReaderLM(prompt: string) {
	        return this.commonLLM.iterRun('readerlm-v2', {
	            prompt,
	            options: {
	                // system: 'You are an AI assistant developed by VENDOR_NAME',
	                stream: true,
	                modelSpecific: {
	                    top_k: 1,
	                    temperature: 0,
	                    repetition_penalty: 1.13,
	                    presence_penalty: 0.25,
	                    frequency_penalty: 0.25,
	                    max_tokens: 8192,
	                }
	            },
	            // Single attempt only; presumably disables automatic retry in LLMManager — confirm.
	            maxTry: 1,
	        });
	    }

	    /**
	     * Consumes a stream of text chunks and, after each chunk, yields a copy of
	     * `snapshot` whose `parsed.textContent` is the concatenation of all chunks
	     * received so far. Other `parsed` fields of the input are preserved.
	     */
	    protected async* accumulateText(snapshot: PageSnapshot, it: AsyncIterable<string>) {
	        const chunks: string[] = [];
	        for await (const txt of it) {
	            chunks.push(txt);
	            const output: PageSnapshot = {
	                ...snapshot,
	                parsed: {
	                    ...snapshot.parsed,
	                    textContent: chunks.join(''),
	                }
	            };
	            yield output;
	        }
	    }

	    /**
	     * Converts a page to Markdown with Gemini, using both the cleaned HTML and a
	     * screenshot of the page.
	     *
	     * @param snapshot Page snapshot; must carry either `pageshotUrl` or `pageshot`.
	     * @throws AssertionFailureError when no screenshot (or no snapshot) is available.
	     */
	    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
	        pageshotUrl?: string,
	    }) {
	        // `||` (not `??`) on purpose: an empty URL string falls back to the raw pageshot.
	        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

	        // A missing snapshot implies a missing pageshot, so the same error is thrown;
	        // the explicit check lets the compiler narrow `snapshot` for the code below.
	        if (!snapshot || !pageshot) {
	            throw new AssertionFailureError('Screenshot of the page is not available');
	        }

	        const html = await this.cleanHtmlForLM(snapshot);

	        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
	            prompt: [
	                `HTML: \n${html}\n\nSCREENSHOT: \n`,
	                // A string is treated as a URL to the screenshot; anything else is passed through as-is.
	                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
	                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
	            ],

	            options: {
	                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
	                stream: true
	            }
	        });

	        yield* this.accumulateText(snapshot, it);
	    }

	    /**
	     * Extracts the main content of a page as Markdown with `readerlm-v2`.
	     *
	     * @param snapshot Page snapshot whose HTML is sent to the model.
	     * @throws AssertionFailureError when `snapshot` is missing.
	     */
	    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
	        if (!snapshot) {
	            throw new AssertionFailureError('Snapshot of the page is not available');
	        }

	        const html = await this.cleanHtmlForLM(snapshot);

	        const it = this.runReaderLM(
	            `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`
	        );

	        yield* this.accumulateText(snapshot, it);
	    }

	    /**
	     * Runs `readerlm-v2` over a page with a caller-supplied instruction and an
	     * optional JSON schema appended to the prompt.
	     *
	     * @param schema Optional JSON schema text; when truthy it is appended in a fenced `json` block.
	     * @param instruction Instruction placed at the top of the prompt.
	     * @param snapshot Page snapshot whose HTML is sent to the model.
	     * @throws AssertionFailureError when `snapshot` is missing.
	     */
	    async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
	        if (!snapshot) {
	            throw new AssertionFailureError('Snapshot of the page is not available');
	        }

	        const html = await this.cleanHtmlForLM(snapshot);

	        const it = this.runReaderLM(
	            `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`
	        );

	        yield* this.accumulateText(snapshot, it);
	    }
	}