pluralchat

Runtime error

App Files Files Community

nsarrazin committed on Nov 22, 2023

Commit

b07f0b1

unverified ·

1 Parent(s): 6456af4

Add ollama endpoint support (#569)

Browse files

* Add ollama endpoint support

* replace if by switch

* Add Ollama example in docs

Files changed (5) hide show

README.md +35 -0
src/lib/server/endpoints/endpoints.ts +3 -0
src/lib/server/endpoints/llamacpp/endpointLlamacpp.ts +1 -1
src/lib/server/endpoints/ollama/endpointOllama.ts +108 -0
src/lib/server/models.ts +16 -12

README.md CHANGED Viewed

@@ -313,6 +313,41 @@ MODELS=[
 Start chat-ui with `npm run dev` and you should be able to chat with Zephyr locally.
 #### Amazon
 You can also specify your Amazon SageMaker instance as an endpoint for chat-ui. The config goes like this:

 Start chat-ui with `npm run dev` and you should be able to chat with Zephyr locally.
+#### Ollama
+We also support the Ollama inference server. Spin up a model with:
+```cli
+ollama run mistral
+```
+Then specify the endpoints like so:
+```env
+MODELS=[
+  {
+      "name": "Ollama Mistral",
+      "chatPromptTemplate": "<s>{{#each messages}}{{#ifUser}}[INST] {{#if @first}}{{#if @root.preprompt}}{{@root.preprompt}}\n{{/if}}{{/if}} {{content}} [/INST]{{/ifUser}}{{#ifAssistant}}{{content}}</s> {{/ifAssistant}}{{/each}}",
+      "parameters": {
+        "temperature": 0.1,
+        "top_p": 0.95,
+        "repetition_penalty": 1.2,
+        "top_k": 50,
+        "truncate": 3072,
+        "max_new_tokens": 1024,
+        "stop": ["</s>"]
+      },
+      "endpoints": [
+        {
+         "type": "ollama",
+         "url" : "http://127.0.0.1:11434",
+         "ollamaName" : "mistral"
+        }
+      ]
+  }
+]
+```
 #### Amazon
 You can also specify your Amazon SageMaker instance as an endpoint for chat-ui. The config goes like this:

src/lib/server/endpoints/endpoints.ts CHANGED Viewed

@@ -5,6 +5,7 @@ import { z } from "zod";
 import endpointAws, { endpointAwsParametersSchema } from "./aws/endpointAws";
 import { endpointOAIParametersSchema, endpointOai } from "./openai/endpointOai";
 import endpointLlamacpp, { endpointLlamacppParametersSchema } from "./llamacpp/endpointLlamacpp";
 // parameters passed when generating text
 interface EndpointParameters {
@@ -32,6 +33,7 @@ export const endpoints = {
 	aws: endpointAws,
 	openai: endpointOai,
 	llamacpp: endpointLlamacpp,
 };
 export const endpointSchema = z.discriminatedUnion("type", [
@@ -39,5 +41,6 @@ export const endpointSchema = z.discriminatedUnion("type", [
 	endpointOAIParametersSchema,
 	endpointTgiParametersSchema,
 	endpointLlamacppParametersSchema,
 ]);
 export default endpoints;

 import endpointAws, { endpointAwsParametersSchema } from "./aws/endpointAws";
 import { endpointOAIParametersSchema, endpointOai } from "./openai/endpointOai";
 import endpointLlamacpp, { endpointLlamacppParametersSchema } from "./llamacpp/endpointLlamacpp";
+import endpointOllama, { endpointOllamaParametersSchema } from "./ollama/endpointOllama";
 // parameters passed when generating text
 interface EndpointParameters {
 	aws: endpointAws,
 	openai: endpointOai,
 	llamacpp: endpointLlamacpp,
+	ollama: endpointOllama,
 };
 export const endpointSchema = z.discriminatedUnion("type", [
 	endpointOAIParametersSchema,
 	endpointTgiParametersSchema,
 	endpointLlamacppParametersSchema,
+	endpointOllamaParametersSchema,
 ]);
 export default endpoints;

src/lib/server/endpoints/llamacpp/endpointLlamacpp.ts CHANGED Viewed

@@ -8,7 +8,7 @@ export const endpointLlamacppParametersSchema = z.object({
 	weight: z.number().int().positive().default(1),
 	model: z.any(),
 	type: z.literal("llamacpp"),
-	url: z.string().url(),
 	accessToken: z.string().min(1).default(HF_ACCESS_TOKEN),
 });

 	weight: z.number().int().positive().default(1),
 	model: z.any(),
 	type: z.literal("llamacpp"),
+	url: z.string().url().default("http://127.0.0.1:8080"),
 	accessToken: z.string().min(1).default(HF_ACCESS_TOKEN),
 });

src/lib/server/endpoints/ollama/endpointOllama.ts ADDED Viewed

	@@ -0,0 +1,108 @@

+import { buildPrompt } from "$lib/buildPrompt";
+import type { TextGenerationStreamOutput } from "@huggingface/inference";
+import type { Endpoint } from "../endpoints";
+import { z } from "zod";
+/** Base URL assumed when an Ollama endpoint config omits `url`. */
+const DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434";
+// Individual field validators, named so the schema below reads as a plain field list.
+const ollamaWeightField = z.number().int().positive().default(1);
+const ollamaModelField = z.any();
+const ollamaTypeField = z.literal("ollama");
+const ollamaUrlField = z.string().url().default(DEFAULT_OLLAMA_URL);
+const ollamaNameField = z.string().min(1).optional();
+/**
+ * Zod schema for an endpoint config entry whose `type` is `"ollama"`.
+ *
+ * - `weight`: positive integer, defaults to 1.
+ * - `model`: accepted as-is (not validated here).
+ * - `url`: must be a valid URL; falls back to `DEFAULT_OLLAMA_URL`.
+ * - `ollamaName`: optional non-empty string; when absent, `endpointOllama`
+ *   sends `model.name` as the model identifier instead.
+ */
+export const endpointOllamaParametersSchema = z.object({
+	weight: ollamaWeightField,
+	model: ollamaModelField,
+	type: ollamaTypeField,
+	url: ollamaUrlField,
+	ollamaName: ollamaNameField,
+});
+/** One newline-delimited JSON message read from the `/api/generate` response stream. */
+interface OllamaStreamChunk {
+	response?: string;
+	done?: boolean;
+	error?: string;
+}
+/**
+ * Creates an `Endpoint` that generates text by POSTing a raw prompt to
+ * `${url}/api/generate` and re-emitting the streamed reply as
+ * `TextGenerationStreamOutput` items.
+ *
+ * @param url - Base URL of the server; trailing slashes are ignored.
+ * @param model - Model config; supplies the prompt template (via `buildPrompt`),
+ *   the sampling parameters and, when `ollamaName` is absent, the model name.
+ * @param ollamaName - Optional model identifier sent in the request body.
+ * @returns An async function that takes `{ conversation }` and resolves to an
+ *   async generator. Every non-final message yields a token with
+ *   `generated_text: null`; the message flagged `done` yields a token marked
+ *   `special` together with the accumulated `generated_text`.
+ * @throws Error if the HTTP response is not OK, if the stream contains a line
+ *   that is not valid JSON, or if a streamed message carries an `error` field.
+ */
+export function endpointOllama({
+	url,
+	model,
+	ollamaName,
+}: z.infer<typeof endpointOllamaParametersSchema>): Endpoint {
+	// Avoid producing "//api/generate" when the configured URL ends with "/".
+	const baseUrl = url.replace(/\/+$/, "");
+	return async ({ conversation }) => {
+		// `?.` keeps an empty conversation from throwing before the prompt is built.
+		const lastMessage = conversation.messages[conversation.messages.length - 1];
+		const prompt = await buildPrompt({
+			messages: conversation.messages,
+			webSearch: lastMessage?.webSearch,
+			preprompt: conversation.preprompt,
+			model,
+		});
+		const r = await fetch(`${baseUrl}/api/generate`, {
+			method: "POST",
+			headers: {
+				"Content-Type": "application/json",
+			},
+			body: JSON.stringify({
+				prompt,
+				model: ollamaName ?? model.name,
+				// The prompt is already fully templated by buildPrompt.
+				raw: true,
+				options: {
+					top_p: model.parameters.top_p,
+					top_k: model.parameters.top_k,
+					temperature: model.parameters.temperature,
+					repeat_penalty: model.parameters.repetition_penalty,
+					stop: model.parameters.stop,
+					num_predict: model.parameters.max_new_tokens,
+				},
+			}),
+		});
+		if (!r.ok) {
+			throw new Error(`Failed to generate text: ${await r.text()}`);
+		}
+		const reader = r.body?.pipeThrough(new TextDecoderStream()).getReader();
+		return (async function* () {
+			// No body to read: nothing to yield.
+			if (!reader) {
+				return;
+			}
+			let generatedText = "";
+			let tokenId = 0;
+			// Holds the trailing partial line between reads: network chunks are not
+			// aligned with JSON message boundaries, so one read may contain several
+			// messages or only a fragment of one.
+			let pending = "";
+			let streamEnded = false;
+			try {
+				while (!streamEnded) {
+					const { done, value } = await reader.read();
+					pending += value ?? "";
+					const lines = pending.split("\n");
+					// Keep the last (possibly incomplete) segment for the next read,
+					// unless the stream is over and it must be processed now.
+					pending = done ? "" : lines.pop() ?? "";
+					for (const line of lines) {
+						if (line.trim() === "") {
+							continue;
+						}
+						let data: OllamaStreamChunk;
+						try {
+							data = JSON.parse(line) as OllamaStreamChunk;
+						} catch (e) {
+							// Fail loudly instead of silently truncating the answer.
+							throw new Error(`Failed to parse Ollama response: ${line}`);
+						}
+						if (data.error) {
+							throw new Error(`Failed to generate text: ${data.error}`);
+						}
+						// A missing `response` must not append the string "undefined".
+						const text = data.response ?? "";
+						if (data.done) {
+							yield {
+								token: {
+									id: tokenId++,
+									text,
+									logprob: 0,
+									special: true,
+								},
+								generated_text: generatedText,
+								details: null,
+							} satisfies TextGenerationStreamOutput;
+							return;
+						}
+						generatedText += text;
+						yield {
+							token: {
+								id: tokenId++,
+								text,
+								logprob: 0,
+								special: false,
+							},
+							generated_text: null,
+							details: null,
+						} satisfies TextGenerationStreamOutput;
+					}
+					streamEnded = done;
+				}
+			} finally {
+				// Runs on normal completion, on errors, and when the consumer stops
+				// iterating early. The rejection is ignored so that a cleanup failure
+				// on an already-errored stream does not mask the original error.
+				await reader.cancel().catch(() => undefined);
+			}
+		})();
+	};
+}
+export default endpointOllama;

src/lib/server/models.ts CHANGED Viewed

@@ -48,7 +48,7 @@ const modelConfig = z.object({
 	parameters: z
 		.object({
 			temperature: z.number().min(0).max(1),
-			truncate: z.number().int().positive(),
 			max_new_tokens: z.number().int().positive(),
 			stop: z.array(z.string()).optional(),
 			top_p: z.number().positive().optional(),
@@ -92,17 +92,21 @@ const addEndpoint = (m: Awaited<ReturnType<typeof processModel>>) => ({
 		for (const endpoint of m.endpoints) {
 			if (random < endpoint.weight) {
 				const args = { ...endpoint, model: m };
-				if (args.type === "tgi") {
-					return endpoints.tgi(args);
-				} else if (args.type === "aws") {
-					return await endpoints.aws(args);
-				} else if (args.type === "openai") {
-					return await endpoints.openai(args);
-				} else if (args.type === "llamacpp") {
-					return await endpoints.llamacpp(args);
-				} else {
-					// for legacy reason
-					return await endpoints.tgi(args);
 				}
 			}
 			random -= endpoint.weight;

 	parameters: z
 		.object({
 			temperature: z.number().min(0).max(1),
+			truncate: z.number().int().positive().optional(),
 			max_new_tokens: z.number().int().positive(),
 			stop: z.array(z.string()).optional(),
 			top_p: z.number().positive().optional(),
 		for (const endpoint of m.endpoints) {
 			if (random < endpoint.weight) {
 				const args = { ...endpoint, model: m };
+				switch (args.type) {
+					case "tgi":
+						return endpoints.tgi(args);
+					case "aws":
+						return await endpoints.aws(args);
+					case "openai":
+						return await endpoints.openai(args);
+					case "llamacpp":
+						return endpoints.llamacpp(args);
+					case "ollama":
+						return endpoints.ollama(args);
+					default:
+						// for legacy reason
+						return endpoints.tgi(args);
 				}
 			}
 			random -= endpoint.weight;