import { HfInference } from "@huggingface/inference";

export const LLM_CONFIG = {
  /* Hugging Face config: */
  ollama: false,
  huggingface: true,
  url: "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
  chatModel: "meta-llama/Meta-Llama-3-8B-Instruct",
  embeddingModel:
    "https://api-inference.huggingface.co/models/mixedbread-ai/mxbai-embed-large-v1",
  embeddingDimension: 1024,

  /* Ollama (local) config: */
  // ollama: true,
  // url: 'http://127.0.0.1:11434',
  // chatModel: 'llama3' as const,
  // embeddingModel: 'mxbai-embed-large',
  // embeddingDimension: 1024,
  // embeddingModel: 'llama3',
  // embeddingDimension: 4096,

  /* Together.ai config:
  ollama: false,
  url: 'https://api.together.xyz',
  chatModel: 'meta-llama/Llama-3-8b-chat-hf',
  embeddingModel: 'togethercomputer/m2-bert-80M-8k-retrieval',
  embeddingDimension: 768,
  */

  /* OpenAI config:
  ollama: false,
  url: 'https://api.openai.com',
  chatModel: 'gpt-3.5-turbo-16k',
  embeddingModel: 'text-embedding-ada-002',
  embeddingDimension: 1536,
  */
};

function apiUrl(path: string) {
  // OPENAI_API_BASE and OLLAMA_HOST are legacy
  const host =
    process.env.LLM_API_URL ??
    process.env.OLLAMA_HOST ??
    process.env.OPENAI_API_BASE ??
    LLM_CONFIG.url;
  if (host.endsWith("/") && path.startsWith("/")) {
    return host + path.slice(1);
  } else if (!host.endsWith("/") && !path.startsWith("/")) {
    return host + "/" + path;
  } else {
    return host + path;
  }
}

function apiKey() {
  return process.env.LLM_API_KEY ?? process.env.OPENAI_API_KEY;
}

const AuthHeaders = (): Record<string, string> =>
  apiKey()
    ? {
        Authorization: "Bearer " + apiKey(),
      }
    : {};

// Overload for non-streaming
export async function chatCompletion(
  body: Omit<CreateChatCompletionRequest, "model"> & {
    model?: CreateChatCompletionRequest["model"];
  } & {
    stream?: false | null | undefined;
  }
): Promise<{ content: string; retries: number; ms: number }>;
// Overload for streaming
export async function chatCompletion(
  body: Omit<CreateChatCompletionRequest, "model"> & {
    model?: CreateChatCompletionRequest["model"];
  } & {
    stream?: true;
  }
): Promise<{ content: ChatCompletionContent; retries: number; ms: number }>;
export async function chatCompletion(
  body: Omit<CreateChatCompletionRequest, "model"> & {
    model?: CreateChatCompletionRequest["model"];
  }
) {
  assertApiKey();
  // OLLAMA_MODEL is legacy
  body.model =
    body.model ??
    process.env.LLM_MODEL ??
    process.env.OLLAMA_MODEL ??
    LLM_CONFIG.chatModel;
  const stopWords = body.stop
    ? typeof body.stop === "string"
      ? [body.stop]
      : body.stop
    : [];
  if (LLM_CONFIG.ollama || LLM_CONFIG.huggingface) stopWords.push("<|eot_id|>");
  const {
    result: content,
    retries,
    ms,
  } = await retryWithBackoff(async () => {
    const hf = new HfInference(apiKey());
    const model = hf.endpoint(apiUrl("/v1/chat/completions"));
    if (body.stream) {
      const completion = model.chatCompletionStream({
        ...body,
      });
      return new ChatCompletionContent(completion, stopWords);
    } else {
      const completion = await model.chatCompletion({
        ...body,
      });
      const content = completion.choices[0].message?.content;
      if (content === undefined) {
        throw new Error(
          "Unexpected result from chat completion: " + JSON.stringify(completion)
        );
      }
      return content;
    }
  });

  return {
    content,
    retries,
    ms,
  };
}
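// Usage sketch for the two chatCompletion overloads above. The prompt text and
// max_tokens value are illustrative assumptions, not part of the original module.
export async function exampleChatCompletion() {
  // Non-streaming: resolves to the full completion string.
  const { content, retries, ms } = await chatCompletion({
    messages: [
      { role: "system", content: "You are a terse assistant." },
      { role: "user", content: "Say hello in five words or fewer." },
    ],
    max_tokens: 64,
  });
  console.log(`completion (${retries} retries, ${ms}ms):`, content);

  // Streaming: resolves to a ChatCompletionContent that can be read incrementally.
  const streamed = await chatCompletion({
    messages: [{ role: "user", content: "Tell me a short story." }],
    stream: true,
  });
  let story = "";
  for await (const chunk of streamed.content.read()) {
    story += chunk;
  }
  console.log("streamed story:", story);
  return content;
}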
export async function tryPullOllama(model: string, error: string) {
  if (error.includes("try pulling")) {
    console.error("Embedding model not found, pulling from Ollama");
    const pullResp = await fetch(apiUrl("/api/pull"), {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ name: model }),
    });
    console.log("Pull response", await pullResp.text());
    throw {
      retry: true,
      error: `Dynamically pulled model.
Original error: ${error}`,
    };
  }
}

export async function fetchEmbeddingBatch(texts: string[]) {
  if (LLM_CONFIG.ollama) {
    return {
      ollama: true as const,
      embeddings: await Promise.all(
        texts.map(async (t) => (await ollamaFetchEmbedding(t)).embedding)
      ),
    };
  }
  assertApiKey();
  if (LLM_CONFIG.huggingface) {
    const result = await fetch(LLM_CONFIG.embeddingModel, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Wait-For-Model": "true",
        ...AuthHeaders(),
      },
      body: JSON.stringify({
        inputs: texts.map((text) => text.replace(/\n/g, " ")),
      }),
    });
    const embeddings = await result.json();
    return {
      // Reuse the Ollama-style return shape (no usage stats) for Hugging Face.
      ollama: true as const,
      embeddings: embeddings,
    };
  }
  const {
    result: json,
    retries,
    ms,
  } = await retryWithBackoff(async () => {
    const result = await fetch(apiUrl("/v1/embeddings"), {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        ...AuthHeaders(),
      },
      body: JSON.stringify({
        model: LLM_CONFIG.embeddingModel,
        input: texts.map((text) => text.replace(/\n/g, " ")),
      }),
    });
    if (!result.ok) {
      throw {
        retry: result.status === 429 || result.status >= 500,
        error: new Error(
          `Embedding failed with code ${result.status}: ${await result.text()}`
        ),
      };
    }
    return (await result.json()) as CreateEmbeddingResponse;
  });
  if (json.data.length !== texts.length) {
    console.error(json);
    throw new Error("Unexpected number of embeddings");
  }
  const allembeddings = json.data;
  allembeddings.sort((a, b) => a.index - b.index);
  return {
    ollama: false as const,
    embeddings: allembeddings.map(({ embedding }) => embedding),
    usage: json.usage?.total_tokens,
    retries,
    ms,
  };
}

export async function fetchEmbedding(text: string) {
  const { embeddings, ...stats } = await fetchEmbeddingBatch([text]);
  return { embedding: embeddings[0], ...stats };
}

export async function fetchModeration(content: string) {
  assertApiKey();
  const { result: flagged } = await retryWithBackoff(async () => {
    const result = await fetch(apiUrl("/v1/moderations"), {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        ...AuthHeaders(),
      },
      body: JSON.stringify({
        input: content,
      }),
    });
    if (!result.ok) {
      throw {
        retry: result.status === 429 || result.status >= 500,
        error: new Error(
          `Moderation failed with code ${result.status}: ${await result.text()}`
        ),
      };
    }
    return (await result.json()) as { results: { flagged: boolean }[] };
  });
  return flagged;
}

export function assertApiKey() {
  if (!LLM_CONFIG.ollama && !apiKey()) {
    throw new Error(
      "\n  Missing LLM_API_KEY in environment variables.\n\n" +
        (LLM_CONFIG.ollama ? "just" : "npx") +
        " convex env set LLM_API_KEY 'your-key'"
    );
  }
}
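// Usage sketch for the embedding helpers above. The input strings are
// illustrative assumptions; the vector length depends on
// LLM_CONFIG.embeddingDimension (1024 for mxbai-embed-large-v1).
export async function exampleEmbeddings() {
  const single = await fetchEmbedding("A lighthouse keeper who collects sea glass");
  console.log("single embedding length:", single.embedding.length);

  const batch = await fetchEmbeddingBatch([
    "first memory to index",
    "second memory to index",
  ]);
  console.log("batch size:", batch.embeddings.length);
  return batch.embeddings;
}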
// Retry after this much time, based on the retry number.
const RETRY_BACKOFF = [1000, 10_000, 20_000]; // In ms
const RETRY_JITTER = 100; // In ms
type RetryError = { retry: boolean; error: any };

export async function retryWithBackoff<T>(
  fn: () => Promise<T>
): Promise<{ retries: number; result: T; ms: number }> {
  let i = 0;
  for (; i <= RETRY_BACKOFF.length; i++) {
    try {
      const start = Date.now();
      const result = await fn();
      const ms = Date.now() - start;
      return { result, retries: i, ms };
    } catch (e) {
      const retryError = e as RetryError;
      if (i < RETRY_BACKOFF.length) {
        if (retryError.retry) {
          console.log(
            `Attempt ${i + 1} failed, waiting ${RETRY_BACKOFF[i]}ms to retry...`,
            Date.now()
          );
          await new Promise((resolve) =>
            setTimeout(resolve, RETRY_BACKOFF[i] + RETRY_JITTER * Math.random())
          );
          continue;
        }
      }
      if (retryError.error) throw retryError.error;
      else throw e;
    }
  }
  throw new Error("Unreachable");
}

// Lifted from openai's package
export interface LLMMessage {
  /**
   * The contents of the message. `content` is required for all messages, and may be
   * null for assistant messages with function calls.
   */
  content: string | null;

  /**
   * The role of the messages author. One of `system`, `user`, `assistant`, or
   * `function`.
   */
  role: "system" | "user" | "assistant" | "function";

  /**
   * The name of the author of this message. `name` is required if role is
   * `function`, and it should be the name of the function whose response is in the
   * `content`. May contain a-z, A-Z, 0-9, and underscores, with a maximum length of
   * 64 characters.
   */
  name?: string;

  /**
   * The name and arguments of a function that should be called, as generated by the model.
   */
  function_call?: {
    // The name of the function to call.
    name: string;
    /**
     * The arguments to call the function with, as generated by the model in
     * JSON format. Note that the model does not always generate valid JSON,
     * and may hallucinate parameters not defined by your function schema.
     * Validate the arguments in your code before calling your function.
     */
    arguments: string;
  };
}

// Non-streaming chat completion response
interface CreateChatCompletionResponse {
  id: string;
  object: string;
  created: number;
  model: string;
  choices: {
    index?: number;
    message?: {
      role: "system" | "user" | "assistant";
      content: string;
    };
    finish_reason?: string;
  }[];
  usage?: {
    completion_tokens: number;
    prompt_tokens: number;
    total_tokens: number;
  };
}

interface CreateEmbeddingResponse {
  data: {
    index: number;
    object: string;
    embedding: number[];
  }[];
  model: string;
  object: string;
  usage: {
    prompt_tokens: number;
    total_tokens: number;
  };
}
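// Usage sketch for retryWithBackoff above: a failing call opts into a retry by
// throwing { retry: true, error }. The URL parameter and status handling here
// are illustrative assumptions, not an endpoint used by this module.
export async function exampleRetry(url: string) {
  const { result, retries, ms } = await retryWithBackoff(async () => {
    const resp = await fetch(url);
    if (!resp.ok) {
      throw {
        // Retry rate limits and server errors; give up immediately on anything else.
        retry: resp.status === 429 || resp.status >= 500,
        error: new Error(`Request failed with code ${resp.status}`),
      };
    }
    return resp.text();
  });
  console.log(`fetched ${result.length} characters after ${retries} retries in ${ms}ms`);
  return result;
}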
export interface CreateChatCompletionRequest {
  /**
   * ID of the model to use.
   * @type {string}
   * @memberof CreateChatCompletionRequest
   */
  model: string;
  // | 'gpt-4'
  // | 'gpt-4-0613'
  // | 'gpt-4-32k'
  // | 'gpt-4-32k-0613'
  // | 'gpt-3.5-turbo'
  // | 'gpt-3.5-turbo-0613'
  // | 'gpt-3.5-turbo-16k' // <- our default
  // | 'gpt-3.5-turbo-16k-0613';
  /**
   * The messages to generate chat completions for, in the chat format:
   * https://platform.openai.com/docs/guides/chat/introduction
   * @type {Array}
   * @memberof CreateChatCompletionRequest
   */
  messages: LLMMessage[];
  /**
   * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
   * make the output more random, while lower values like 0.2 will make it more
   * focused and deterministic. We generally recommend altering this or `top_p` but
   * not both.
   * @type {number}
   * @memberof CreateChatCompletionRequest
   */
  temperature?: number | null;
  /**
   * An alternative to sampling with temperature, called nucleus sampling, where the
   * model considers the results of the tokens with top_p probability mass. So 0.1
   * means only the tokens comprising the top 10% probability mass are considered.
   * We generally recommend altering this or `temperature` but not both.
   * @type {number}
   * @memberof CreateChatCompletionRequest
   */
  top_p?: number | null;
  /**
   * How many chat completion choices to generate for each input message.
   * @type {number}
   * @memberof CreateChatCompletionRequest
   */
  n?: number | null;
  /**
   * If set, partial message deltas will be sent, like in ChatGPT. Tokens will be
   * sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format)
   * as they become available, with the stream terminated by a `data: [DONE]` message.
   * @type {boolean}
   * @memberof CreateChatCompletionRequest
   */
  stream?: boolean | null;
  /**
   *
   * @type {CreateChatCompletionRequestStop}
   * @memberof CreateChatCompletionRequest
   */
  stop?: Array<string> | string;
  /**
   * The maximum number of tokens allowed for the generated answer. By default,
   * the number of tokens the model can return will be (4096 - prompt tokens).
   * @type {number}
   * @memberof CreateChatCompletionRequest
   */
  max_tokens?: number;
  /**
   * Number between -2.0 and 2.0. Positive values penalize new tokens based on
   * whether they appear in the text so far, increasing the model's likelihood
   * to talk about new topics. See more information about frequency and
   * presence penalties:
   * https://platform.openai.com/docs/api-reference/parameter-details
   * @type {number}
   * @memberof CreateChatCompletionRequest
   */
  presence_penalty?: number | null;
  /**
   * Number between -2.0 and 2.0. Positive values penalize new tokens based on
   * their existing frequency in the text so far, decreasing the model's
   * likelihood to repeat the same line verbatim. See more information about
   * presence penalties:
   * https://platform.openai.com/docs/api-reference/parameter-details
   * @type {number}
   * @memberof CreateChatCompletionRequest
   */
  frequency_penalty?: number | null;
  /**
   * Modify the likelihood of specified tokens appearing in the completion.
   * Accepts a json object that maps tokens (specified by their token ID in the
   * tokenizer) to an associated bias value from -100 to 100. Mathematically,
   * the bias is added to the logits generated by the model prior to sampling.
   * The exact effect will vary per model, but values between -1 and 1 should
   * decrease or increase likelihood of selection; values like -100 or 100
   * should result in a ban or exclusive selection of the relevant token.
   * @type {object}
   * @memberof CreateChatCompletionRequest
   */
  logit_bias?: object | null;
  /**
   * A unique identifier representing your end-user, which can help OpenAI to
   * monitor and detect abuse. Learn more:
   * https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids
   * @type {string}
   * @memberof CreateChatCompletionRequest
   */
  user?: string;
  tools?: {
    // The type of the tool. Currently, only function is supported.
    type: "function";
    function: {
      /**
       * The name of the function to be called. Must be a-z, A-Z, 0-9, or
       * contain underscores and dashes, with a maximum length of 64.
       */
      name: string;
      /**
       * A description of what the function does, used by the model to choose
       * when and how to call the function.
       */
      description?: string;
      /**
       * The parameters the functions accepts, described as a JSON Schema
       * object. See the guide[1] for examples, and the JSON Schema reference[2]
       * for documentation about the format.
       * [1]: https://platform.openai.com/docs/guides/gpt/function-calling
       * [2]: https://json-schema.org/understanding-json-schema/
       * To describe a function that accepts no parameters, provide the value
       * {"type": "object", "properties": {}}.
       */
      parameters: object;
    };
  }[];
  /**
   * Controls which (if any) function is called by the model. `none` means the
   * model will not call a function and instead generates a message.
   * `auto` means the model can pick between generating a message or calling a
   * function. Specifying a particular function via
   * {"type": "function", "function": {"name": "my_function"}} forces the model
   * to call that function.
   *
   * `none` is the default when no functions are present.
   * `auto` is the default if functions are present.
   */
  tool_choice?:
    | "none" // none means the model will not call a function and instead generates a message.
    | "auto" // auto means the model can pick between generating a message or calling a function.
    // Specifies a tool the model should use. Use to force the model to call
    // a specific function.
    | {
        // The type of the tool. Currently, only function is supported.
        type: "function";
        function: { name: string };
      };
  // Replaced by "tools"
  // functions?: {
  //   /**
  //    * The name of the function to be called. Must be a-z, A-Z, 0-9, or
  //    * contain underscores and dashes, with a maximum length of 64.
  //    */
  //   name: string;
  //   /**
  //    * A description of what the function does, used by the model to choose
  //    * when and how to call the function.
  //    */
  //   description?: string;
  //   /**
  //    * The parameters the functions accepts, described as a JSON Schema
  //    * object. See the guide[1] for examples, and the JSON Schema reference[2]
  //    * for documentation about the format.
  //    * [1]: https://platform.openai.com/docs/guides/gpt/function-calling
  //    * [2]: https://json-schema.org/understanding-json-schema/
  //    * To describe a function that accepts no parameters, provide the value
  //    * {"type": "object", "properties": {}}.
  //    */
  //   parameters: object;
  // }[];
  // /**
  //  * Controls how the model responds to function calls. "none" means the model
  //  * does not call a function, and responds to the end-user. "auto" means the
  //  * model can pick between an end-user or calling a function. Specifying a
  //  * particular function via {"name": "my_function"} forces the model to call
  //  * that function.
  //  * - "none" is the default when no functions are present.
  //  * - "auto" is the default if functions are present.
  //  */
  // function_call?: 'none' | 'auto' | { name: string };
  /**
   * An object specifying the format that the model must output.
   *
   * Setting to { "type": "json_object" } enables JSON mode, which guarantees
   * the message the model generates is valid JSON.
   *
   * *Important*: when using JSON mode, you must also instruct the model to
   * produce JSON yourself via a system or user message. Without this, the model
   * may generate an unending stream of whitespace until the generation reaches
   * the token limit, resulting in a long-running and seemingly "stuck" request.
   * Also note that the message content may be partially cut off if
   * finish_reason="length", which indicates the generation exceeded max_tokens
   * or the conversation exceeded the max context length.
   */
  response_format?: { type: "text" | "json_object" };
}
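// Illustrative CreateChatCompletionRequest value showing how the optional
// sampling and stop fields fit together. The model name matches
// LLM_CONFIG.chatModel above; all other values are assumptions.
export const exampleChatRequest: CreateChatCompletionRequest = {
  model: "meta-llama/Meta-Llama-3-8B-Instruct",
  messages: [
    { role: "system", content: "You are a helpful villager in a small town." },
    { role: "user", content: "What are you up to today?" },
  ],
  temperature: 0.7,
  max_tokens: 256,
  stop: ["<|eot_id|>"],
  stream: false,
};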
// Checks whether a suffix of s1 is a prefix of s2. For example,
// ('Hello', 'Kira:') -> false
// ('Hello Kira', 'Kira:') -> true
const suffixOverlapsPrefix = (s1: string, s2: string) => {
  for (let i = 1; i <= Math.min(s1.length, s2.length); i++) {
    const suffix = s1.substring(s1.length - i);
    const prefix = s2.substring(0, i);
    if (suffix === prefix) {
      return true;
    }
  }
  return false;
};

// Minimal shape of a streaming chat completion chunk, as produced by the
// Hugging Face inference client's OpenAI-compatible chat completion stream.
type ChatCompletionChunk = {
  choices: {
    delta: {
      content?: string;
    };
  }[];
};

export class ChatCompletionContent {
  private readonly completion: AsyncIterable<ChatCompletionChunk>;
  private readonly stopWords: string[];

  constructor(
    completion: AsyncIterable<ChatCompletionChunk>,
    stopWords: string[]
  ) {
    this.completion = completion;
    this.stopWords = stopWords;
  }

  async *readInner() {
    for await (const chunk of this.completion) {
      yield chunk.choices[0].delta.content;
    }
  }

  // stop words in OpenAI api don't always work.
  // So we have to truncate on our side.
  async *read() {
    let lastFragment = "";
    for await (const data of this.readInner()) {
      lastFragment += data;
      let hasOverlap = false;
      for (const stopWord of this.stopWords) {
        const idx = lastFragment.indexOf(stopWord);
        if (idx >= 0) {
          yield lastFragment.substring(0, idx);
          return;
        }
        if (suffixOverlapsPrefix(lastFragment, stopWord)) {
          hasOverlap = true;
        }
      }
      if (hasOverlap) continue;
      yield lastFragment;
      lastFragment = "";
    }
    yield lastFragment;
  }

  async readAll() {
    let allContent = "";
    for await (const chunk of this.read()) {
      allContent += chunk;
    }
    return allContent;
  }
}

export async function ollamaFetchEmbedding(text: string) {
  const { result } = await retryWithBackoff(async () => {
    const resp = await fetch(apiUrl("/api/embeddings"), {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ model: LLM_CONFIG.embeddingModel, prompt: text }),
    });
    if (resp.status === 404) {
      const error = await resp.text();
      await tryPullOllama(LLM_CONFIG.embeddingModel, error);
      throw new Error(`Failed to fetch embeddings: ${resp.status}`);
    }
    return (await resp.json()).embedding as number[];
  });
  return { embedding: result };
}
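// Sketch of how ChatCompletionContent truncates on stop words, using a
// hand-built chunk stream instead of a live model (all strings are assumptions).
export async function exampleStopWordTruncation() {
  async function* fakeChunks() {
    for (const content of ["Hi there!", " Kira", ": how are you?"]) {
      yield { choices: [{ delta: { content } }] };
    }
  }
  const stream = new ChatCompletionContent(fakeChunks(), ["Kira:"]);
  // "Hi there!" is yielded immediately; " Kira" is held back because it
  // overlaps a stop-word prefix, and everything from "Kira:" onward is dropped.
  const text = await stream.readAll();
  console.log(text); // "Hi there! "
  return text;
}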