Spaces:
Build error
Build error
| import { AsyncService } from 'civkit/async-service'; | |
| import { singleton } from 'tsyringe'; | |
| import { PageSnapshot } from './puppeteer'; | |
| import { GlobalLogger } from './logger'; | |
| import _ from 'lodash'; | |
| import { AssertionFailureError } from 'civkit'; | |
| import { LLMManager } from '../shared/services/common-llm'; | |
| import { JSDomControl } from './jsdom'; | |
// Markdown code-fence delimiter, kept in a constant so that fences can be
// embedded inside the template-literal prompts below.
const tripleBackTick = '```';

/**
 * Service that turns a captured page snapshot into LLM-generated text.
 *
 * Every public method is an async generator: each time the model streams a
 * new chunk, a copy of the input snapshot is yielded whose
 * `parsed.textContent` holds all text received so far, so consumers can
 * render partial results progressively.
 */
@singleton()
export class LmControl extends AsyncService {

    /**
     * Selector list passed to `JSDomControl.cleanHTMLforLMs` for every prompt.
     * NOTE(review): presumably the matching elements are removed from the
     * HTML before it is sent to the model; confirm against JSDomControl.
     */
    protected static readonly lmCleanupSelector = 'script,link,style,textarea,select>option,svg';

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        protected commonLLM: LLMManager,
        protected jsdomControl: JSDomControl,
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Converts a page to Markdown with Gemini, using both the cleaned HTML
     * and a screenshot of the page.
     *
     * @param snapshot Page snapshot; must carry a screenshot either as
     *   `pageshotUrl` or as `pageshot`.
     * @returns Async generator yielding progressively more complete snapshots.
     * @throws AssertionFailureError when no screenshot is available.
     */
    async *geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }): AsyncGenerator<PageSnapshot, void, unknown> {
        // `||` (not `??`) on purpose: an empty URL string also falls back to
        // the in-memory screenshot.
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        // `!snapshot` can never be true when `pageshot` is truthy; it is
        // there so the compiler narrows `snapshot` for the code below.
        if (!snapshot || !pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const html = await this.cleanHTMLForLM(snapshot);

        const textStream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${html}\n\nSCREENSHOT: \n`,
                // A string screenshot is treated as a URL; anything else is
                // handed to the model as-is.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        yield* this.streamIntoSnapshot(snapshot, textStream);
    }

    /**
     * Extracts the main content of a page as Markdown using ReaderLM.
     *
     * @param snapshot Page snapshot whose `html` is sent to the model.
     * @returns Async generator yielding progressively more complete snapshots.
     * @throws AssertionFailureError when `snapshot` is missing.
     */
    async *readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot): AsyncGenerator<PageSnapshot, void, unknown> {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const html = await this.cleanHTMLForLM(snapshot);
        const prompt = `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`;

        yield* this.streamIntoSnapshot(snapshot, this.iterReaderLM(prompt));
    }

    /**
     * Runs ReaderLM with a free-form instruction, optionally constrained by a
     * JSON schema, over the page HTML.
     *
     * @param schema Optional JSON schema text, appended to the prompt in a
     *   fenced `json` block when non-empty.
     * @param instruction Instruction placed at the top of the prompt.
     * @param snapshot Page snapshot whose `html` is sent to the model.
     * @returns Async generator yielding progressively more complete snapshots.
     * @throws AssertionFailureError when `snapshot` is missing.
     */
    async *readerLMFromSnapshot(
        schema?: string,
        instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.',
        snapshot?: PageSnapshot,
    ): AsyncGenerator<PageSnapshot, void, unknown> {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const html = await this.cleanHTMLForLM(snapshot);
        const schemaSection = schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : '';
        const prompt = `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schemaSection}`;

        yield* this.streamIntoSnapshot(snapshot, this.iterReaderLM(prompt));
    }

    /** Cleans the snapshot's HTML with the selector list shared by all prompts. */
    protected cleanHTMLForLM(snapshot: PageSnapshot) {
        return this.jsdomControl.cleanHTMLforLMs(snapshot.html, LmControl.lmCleanupSelector);
    }

    /**
     * Starts a streaming `readerlm-v2` run with the decoding settings shared
     * by both ReaderLM entry points.
     */
    protected iterReaderLM(prompt: string) {
        return this.commonLLM.iterRun('readerlm-v2', {
            prompt,

            options: {
                // system: 'You are an AI assistant developed by VENDOR_NAME',
                stream: true,
                modelSpecific: {
                    top_k: 1,
                    temperature: 0,
                    repetition_penalty: 1.13,
                    presence_penalty: 0.25,
                    frequency_penalty: 0.25,
                    max_tokens: 8192,
                }
            },
            // NOTE(review): presumably limits the run to a single attempt; confirm in LLMManager.
            maxTry: 1,
        });
    }

    /**
     * Folds a stream of text chunks into successive copies of `snapshot`.
     *
     * After every chunk a new object is yielded: a shallow copy of `snapshot`
     * whose `parsed` is a shallow copy of the original `parsed` with
     * `textContent` replaced by all text received so far. The input snapshot
     * is never mutated.
     *
     * The text is accumulated in a running string rather than re-joining an
     * array of chunks on every iteration, which keeps the total work linear
     * in the output length.
     */
    protected async *streamIntoSnapshot(
        snapshot: PageSnapshot,
        textStream: AsyncIterable<string>,
    ): AsyncGenerator<PageSnapshot, void, unknown> {
        let textSoFar = '';

        for await (const chunk of textStream) {
            textSoFar += chunk;

            yield {
                ...snapshot,
                parsed: {
                    ...snapshot.parsed,
                    textContent: textSoFar,
                }
            };
        }
    }
}