Anyone know how to run it locally and with langchain?


Running it locally instead of through the Hub API.

There's a GGML version of Wizard-vicuna with LlamaCpp support that's 13B as well. Maybe try that instead or is it any particular reason you want THIS model? If you get the GGML version you can simply load it with the LlamaCpp interface in langchain!

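For reference, a minimal sketch of loading a GGML model through LangChain's LlamaCpp wrapper (the model filename below is just a placeholder, point model_path at whichever GGML file you actually downloaded):

from langchain.llms import LlamaCpp

# model_path is a placeholder -- use the path of the GGML file you downloaded
llm = LlamaCpp(
    model_path="./wizard-vicuna-13b.ggmlv3.q4_0.bin",
    n_ctx=2048,        # context window size
    temperature=0.7,
    max_tokens=500,
)

print(llm("Hello, who are you?"))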

Is this true? Do you know what version of GGML is supported right now in their llama.cpp? Are they at v3 too?

Assuming the webui is running and its API is enabled on port 8080:

Create this file, llm_client.py:

from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any

import requests

HOST = 'localhost:8080'
URI = f'http://{HOST}/api/v1/generate'


class AlpacaLLM(LLM):
    @property
    def _llm_type(self) -> str:
        return "custom"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # Extend the stop sequences so ReAct-style prompts terminate cleanly
        if isinstance(stop, list):
            stop = stop + ["\n###", "\nObservation:", "\nObservations:"]

        # Call the webui's generate endpoint with fixed sampling settings
        response = requests.post(
            URI,
            json={
                "prompt": prompt,
                "temperature": 0.7,
                "max_new_tokens": 500,
                "early_stopping": True,
                "stopping_strings": stop,
                'do_sample': True,
                'top_p': 0.1,
                'typical_p': 1,
                'repetition_penalty': 1.18,
                'top_k': 40,
                'min_length': 0,
                'no_repeat_ngram_size': 0,
                'num_beams': 1,
                'penalty_alpha': 0,
                'length_penalty': 1,
                'seed': -1,
                'add_bos_token': True,
                'truncation_length': 2048,
                'ban_eos_token': False,
                'skip_special_tokens': True,
            },
        )
        response.raise_for_status()
        return response.json()['results'][0]['text']

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {}

Then use it like this:

from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chains import ConversationChain

from llm_client import AlpacaLLM

llm = AlpacaLLM()
memory = ConversationBufferMemory()
conversation = ConversationChain(
    llm=llm, verbose=True, memory=memory
)

conversation.predict(input="hi there, i am you doom")
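Since _call already appends ReAct-style stop strings ("\nObservation:" etc.), the same class should also drop into an agent. A rough sketch using the classic initialize_agent API; the llm-math tool is only an example, swap in whatever tools you need:

from langchain.agents import initialize_agent, load_tools

from llm_client import AlpacaLLM

llm = AlpacaLLM()
# Example tool only; any LangChain tool list works here
tools = load_tools(["llm-math"], llm=llm)
agent = initialize_agent(
    tools, llm, agent="zero-shot-react-description", verbose=True
)

agent.run("What is 13 raised to the 0.5 power?")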
