|
|
|
|
|
import requests |
|
|
|
# host:port of the local text-generation-webui API server.
# NOTE(review): 0.0.0.0 is a bind-all address; using it as a *client* target
# happens to reach localhost on Linux but is non-portable — confirm whether
# 127.0.0.1 was intended.
HOST = '0.0.0.0:5000'
|
|
|
|
|
def generate(prompt, tokens=200):
    """Ask the server to continue *prompt* and return the generated text.

    Parameters
    ----------
    prompt : str
        Text for the model to continue.
    tokens : int
        Maximum number of new tokens to generate (default 200).

    Returns
    -------
    str
        The generated continuation, taken from ``results[0]['text']``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-200 status. (The original code
        silently fell through and returned an implicit ``None`` in that
        case, which made callers crash later with an opaque TypeError.)
    """
    payload = {'prompt': prompt, 'max_new_tokens': tokens}
    response = requests.post(f'http://{HOST}/api/v1/generate', json=payload)
    # Fail loudly on error responses instead of returning an implicit None.
    response.raise_for_status()
    return response.json()['results'][0]['text']
|
|
|
|
|
def model_api(request):
    """POST *request* to the model-management endpoint and return the parsed JSON reply."""
    url = f'http://{HOST}/api/v1/model'
    reply = requests.post(url, json=request)
    return reply.json()
|
|
|
|
|
|
|
def print_basic_model_info(response):
    """Pretty-print the model name, loaded LoRA(s), and a few key settings.

    *response* is the JSON reply of a model-API call and must contain a
    ``result`` mapping with ``model_name``, ``lora_names``, and a
    ``shared.settings`` sub-mapping.
    """
    result = response['result']
    print("Model: ", result['model_name'])
    print("Lora(s): ", result['lora_names'])
    settings = result['shared.settings']
    for key in ('truncation_length', 'instruction_template'):
        print(key, "=", settings[key])
|
|
|
|
|
|
|
def model_info():
    """Fetch info about the currently loaded model and print a short summary."""
    info = model_api({'action': 'info'})
    print_basic_model_info(info)
|
|
|
|
|
|
|
def model_load(model_name):
    """Ask the server to load *model_name* and return its JSON reply."""
    payload = {'action': 'load', 'model_name': model_name}
    return model_api(payload)
|
|
|
|
|
|
|
def complex_model_load(model):
    """Build a load request for *model* from naming heuristics and send it.

    The (lower-cased) model name is scanned for quantization/format hints
    such as ``4bit``, ``128g``, ``ggml``, ``-hf``, or ``rwkv``, and the
    loader arguments are filled in accordingly before calling the model
    API. Returns the server's JSON reply.
    """

    def guess_groupsize(model_name):
        # Check the longer markers first so '1024g' is not matched as '32g'
        # by accident; -1 means "no group size".
        for marker, size in (('1024g', 1024), ('128g', 128), ('32g', 32)):
            if marker in model_name:
                return size
        return -1

    req = {
        'action': 'load',
        'model_name': model,
        'args': {
            'loader': 'AutoGPTQ',

            'bf16': False,
            'load_in_8bit': False,
            'groupsize': 0,
            'wbits': 0,

            'threads': 0,
            'n_batch': 512,
            'no_mmap': False,
            'mlock': False,
            'cache_capacity': None,
            'n_gpu_layers': 0,
            'n_ctx': 2048,

            'rwkv_strategy': None,
            'rwkv_cuda_on': False,
        },
    }

    # Heuristics operate on the lower-cased name; the request keeps the
    # original casing for 'model_name'.
    name = model.lower()
    args = req['args']

    # GPTQ-style quantization hints.
    if '4bit' in name or 'gptq' in name or 'int4' in name:
        args['wbits'] = 4
        args['groupsize'] = guess_groupsize(name)
    elif '3bit' in name:
        args['wbits'] = 3
        args['groupsize'] = guess_groupsize(name)
    else:
        args['gptq_for_llama'] = False

    if '8bit' in name:
        args['load_in_8bit'] = True
    elif '-hf' in name or 'fp16' in name:
        if '7b' in name:
            args['bf16'] = True
        elif '13b' in name:
            args['load_in_8bit'] = True
    elif 'ggml' in name:
        # GPU-layer counts by model size; 7b and 13b share the same value.
        if '7b' in name or '13b' in name:
            args['n_gpu_layers'] = 100
        elif '30b' in name or '33b' in name:
            args['n_gpu_layers'] = 59
        elif '65b' in name:
            args['n_gpu_layers'] = 42
    elif 'rwkv' in name:
        args['rwkv_cuda_on'] = True
        args['rwkv_strategy'] = 'cuda f16i8' if '14b' in name else 'cuda f16'

    return model_api(req)
|
|
|
|
|
if __name__ == '__main__':
    # Smoke-test every model the server knows about: load it, print its
    # basic info, then check that it can continue the Fibonacci sequence.
    #
    # NOTE(review): the PASS print below was split across two physical
    # lines inside a single-quoted f-string — a SyntaxError. The stray 'β'
    # glyphs were mojibake of the ✅/❌ markers (UTF-8 emoji bytes read
    # through a legacy encoding); both are repaired here.
    for model in model_api({'action': 'list'})['result']:
        try:
            resp = complex_model_load(model)

            if 'error' in resp:
                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue
            else:
                print_basic_model_info(resp)

            # 0,1,1,2,3,5,8,13 → the next Fibonacci number is 21.
            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

            if '21' in ans:
                print(f"✅ {model} PASS ({ans})")
            else:
                print(f"❌ {model} FAIL ({ans})")

        except Exception as e:
            print(f"❌ {model} FAIL Exception: {repr(e)}")
|
|
|
|
|
|
|
|
|
""" $ ./model-api-example.py |
|
Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda |
|
Lora(s): [] |
|
truncation_length = 2048 |
|
instruction_template = Alpaca |
|
✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
|
Model: 4bit_WizardLM-13B-Uncensored-4bit-128g |
|
Lora(s): [] |
|
truncation_length = 2048 |
|
instruction_template = WizardLM |
|
✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
|
Model: Aeala_VicUnlocked-alpaca-30b-4bit |
|
Lora(s): [] |
|
truncation_length = 2048 |
|
instruction_template = Alpaca |
|
✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
|
Model: alpaca-30b-4bit |
|
Lora(s): [] |
|
truncation_length = 2048 |
|
instruction_template = Alpaca |
|
✅ alpaca-30b-4bit PASS (21)
|
""" |
|
|