Spaces:
Configuration error
Configuration error
package main | |
// This is a wrapper to satisfy the gRPC service interface.
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc.).
import ( | |
"fmt" | |
"path/filepath" | |
"github.com/go-skynet/go-llama.cpp" | |
"github.com/mudler/LocalAI/pkg/grpc/base" | |
pb "github.com/mudler/LocalAI/pkg/grpc/proto" | |
) | |
type LLM struct { | |
base.SingleThread | |
llama *llama.LLama | |
draftModel *llama.LLama | |
} | |
func (llm *LLM) Load(opts *pb.ModelOptions) error { | |
ropeFreqBase := float32(10000) | |
ropeFreqScale := float32(1) | |
if opts.RopeFreqBase != 0 { | |
ropeFreqBase = opts.RopeFreqBase | |
} | |
if opts.RopeFreqScale != 0 { | |
ropeFreqScale = opts.RopeFreqScale | |
} | |
llamaOpts := []llama.ModelOption{ | |
llama.WithRopeFreqBase(ropeFreqBase), | |
llama.WithRopeFreqScale(ropeFreqScale), | |
} | |
if opts.NoMulMatQ { | |
llamaOpts = append(llamaOpts, llama.SetMulMatQ(false)) | |
} | |
// Get base path of opts.ModelFile and use the same for lora (assume the same path) | |
basePath := filepath.Dir(opts.ModelFile) | |
if opts.LoraAdapter != "" { | |
llamaOpts = append(llamaOpts, llama.SetLoraAdapter(filepath.Join(basePath, opts.LoraAdapter))) | |
} | |
if opts.LoraBase != "" { | |
llamaOpts = append(llamaOpts, llama.SetLoraBase(filepath.Join(basePath, opts.LoraBase))) | |
} | |
if opts.ContextSize != 0 { | |
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize))) | |
} | |
if opts.F16Memory { | |
llamaOpts = append(llamaOpts, llama.EnableF16Memory) | |
} | |
if opts.Embeddings { | |
llamaOpts = append(llamaOpts, llama.EnableEmbeddings) | |
} | |
if opts.NGPULayers != 0 { | |
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers))) | |
} | |
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap)) | |
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU)) | |
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit)) | |
if opts.NBatch != 0 { | |
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch))) | |
} else { | |
llamaOpts = append(llamaOpts, llama.SetNBatch(512)) | |
} | |
if opts.NUMA { | |
llamaOpts = append(llamaOpts, llama.EnableNUMA) | |
} | |
if opts.LowVRAM { | |
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM) | |
} | |
if opts.DraftModel != "" { | |
// https://github.com/ggerganov/llama.cpp/blob/71ca2fad7d6c0ef95ef9944fb3a1a843e481f314/examples/speculative/speculative.cpp#L40 | |
llamaOpts = append(llamaOpts, llama.SetPerplexity(true)) | |
} | |
model, err := llama.New(opts.ModelFile, llamaOpts...) | |
if opts.DraftModel != "" { | |
// opts.DraftModel is relative to opts.ModelFile, so we need to get the basepath of opts.ModelFile | |
if !filepath.IsAbs(opts.DraftModel) { | |
dir := filepath.Dir(opts.ModelFile) | |
opts.DraftModel = filepath.Join(dir, opts.DraftModel) | |
} | |
draftModel, err := llama.New(opts.DraftModel, llamaOpts...) | |
if err != nil { | |
return err | |
} | |
llm.draftModel = draftModel | |
} | |
llm.llama = model | |
return err | |
} | |
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption { | |
ropeFreqBase := float32(10000) | |
ropeFreqScale := float32(1) | |
if opts.RopeFreqBase != 0 { | |
ropeFreqBase = opts.RopeFreqBase | |
} | |
if opts.RopeFreqScale != 0 { | |
ropeFreqScale = opts.RopeFreqScale | |
} | |
predictOptions := []llama.PredictOption{ | |
llama.SetTemperature(opts.Temperature), | |
llama.SetTopP(opts.TopP), | |
llama.SetTopK(int(opts.TopK)), | |
llama.SetTokens(int(opts.Tokens)), | |
llama.SetThreads(int(opts.Threads)), | |
llama.WithGrammar(opts.Grammar), | |
llama.SetRopeFreqBase(ropeFreqBase), | |
llama.SetRopeFreqScale(ropeFreqScale), | |
llama.SetNegativePromptScale(opts.NegativePromptScale), | |
llama.SetNegativePrompt(opts.NegativePrompt), | |
} | |
if opts.PromptCacheAll { | |
predictOptions = append(predictOptions, llama.EnablePromptCacheAll) | |
} | |
if opts.PromptCacheRO { | |
predictOptions = append(predictOptions, llama.EnablePromptCacheRO) | |
} | |
// Expected absolute path | |
if opts.PromptCachePath != "" { | |
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath)) | |
} | |
if opts.Mirostat != 0 { | |
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat))) | |
} | |
if opts.MirostatETA != 0 { | |
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA)) | |
} | |
if opts.MirostatTAU != 0 { | |
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU)) | |
} | |
if opts.Debug { | |
predictOptions = append(predictOptions, llama.Debug) | |
} | |
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...)) | |
if opts.PresencePenalty != 0 { | |
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty)) | |
} | |
if opts.NKeep != 0 { | |
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep))) | |
} | |
if opts.Batch != 0 { | |
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch))) | |
} | |
if opts.F16KV { | |
predictOptions = append(predictOptions, llama.EnableF16KV) | |
} | |
if opts.IgnoreEOS { | |
predictOptions = append(predictOptions, llama.IgnoreEOS) | |
} | |
if opts.Seed != 0 { | |
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed))) | |
} | |
if opts.NDraft != 0 { | |
predictOptions = append(predictOptions, llama.SetNDraft(int(opts.NDraft))) | |
} | |
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed)) | |
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty)) | |
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock)) | |
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap)) | |
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU)) | |
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit)) | |
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ)) | |
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP)) | |
return predictOptions | |
} | |
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) { | |
if llm.draftModel != nil { | |
return llm.llama.SpeculativeSampling(llm.draftModel, opts.Prompt, buildPredictOptions(opts)...) | |
} | |
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...) | |
} | |
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error { | |
predictOptions := buildPredictOptions(opts) | |
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool { | |
results <- token | |
return true | |
})) | |
go func() { | |
var err error | |
if llm.draftModel != nil { | |
_, err = llm.llama.SpeculativeSampling(llm.draftModel, opts.Prompt, buildPredictOptions(opts)...) | |
} else { | |
_, err = llm.llama.Predict(opts.Prompt, predictOptions...) | |
} | |
if err != nil { | |
fmt.Println("err: ", err) | |
} | |
close(results) | |
}() | |
return nil | |
} | |
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) { | |
predictOptions := buildPredictOptions(opts) | |
if len(opts.EmbeddingTokens) > 0 { | |
tokens := []int{} | |
for _, t := range opts.EmbeddingTokens { | |
tokens = append(tokens, int(t)) | |
} | |
return llm.llama.TokenEmbeddings(tokens, predictOptions...) | |
} | |
return llm.llama.Embeddings(opts.Embeddings, predictOptions...) | |
} | |
func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) { | |
predictOptions := buildPredictOptions(opts) | |
l, tokens, err := llm.llama.TokenizeString(opts.Prompt, predictOptions...) | |
if err != nil { | |
return pb.TokenizationResponse{}, err | |
} | |
return pb.TokenizationResponse{ | |
Length: l, | |
Tokens: tokens, | |
}, nil | |
} | |