rodrigomasini committed on
Commit dd7ed3d
1 Parent(s): 2a9b1c8

Upload 4 files

api-examples/api-example-chat-stream.py ADDED
@@ -0,0 +1,101 @@
+import asyncio
+import json
+import sys
+
+try:
+    import websockets
+except ImportError:
+    print("Websockets package not found. Make sure it's installed.")
+
+# For local streaming, the websockets are hosted without ssl - ws://
+HOST = 'localhost:5005'
+URI = f'ws://{HOST}/api/v1/chat-stream'
+
+# For reverse-proxied streaming, the remote will likely host with ssl - wss://
+# URI = 'wss://your-uri-here.trycloudflare.com/api/v1/stream'
+
+
+async def run(user_input, history):
+    # Note: the selected defaults change from time to time.
+    request = {
+        'user_input': user_input,
+        'max_new_tokens': 250,
+        'history': history,
+        'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
+        'character': 'Example',
+        'instruction_template': 'Vicuna-v1.1',  # Will get autodetected if unset
+        # 'context_instruct': '',  # Optional
+        'your_name': 'You',
+
+        'regenerate': False,
+        '_continue': False,
+        'stop_at_newline': False,
+        'chat_generation_attempts': 1,
+        'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
+
+        # Generation params. If 'preset' is set to different than 'None', the values
+        # in presets/preset-name.yaml are used instead of the individual numbers.
+        'preset': 'None',
+        'do_sample': True,
+        'temperature': 0.7,
+        'top_p': 0.1,
+        'typical_p': 1,
+        'epsilon_cutoff': 0,  # In units of 1e-4
+        'eta_cutoff': 0,  # In units of 1e-4
+        'tfs': 1,
+        'top_a': 0,
+        'repetition_penalty': 1.18,
+        'repetition_penalty_range': 0,
+        'top_k': 40,
+        'min_length': 0,
+        'no_repeat_ngram_size': 0,
+        'num_beams': 1,
+        'penalty_alpha': 0,
+        'length_penalty': 1,
+        'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
+
+        'seed': -1,
+        'add_bos_token': True,
+        'truncation_length': 2048,
+        'ban_eos_token': False,
+        'skip_special_tokens': True,
+        'stopping_strings': []
+    }
+
+    async with websockets.connect(URI, ping_interval=None) as websocket:
+        await websocket.send(json.dumps(request))
+
+        while True:
+            incoming_data = await websocket.recv()
+            incoming_data = json.loads(incoming_data)
+
+            match incoming_data['event']:
+                case 'text_stream':
+                    yield incoming_data['history']
+                case 'stream_end':
+                    return
+
+
+async def print_response_stream(user_input, history):
+    cur_len = 0
+    async for new_history in run(user_input, history):
+        cur_message = new_history['visible'][-1][1][cur_len:]
+        cur_len += len(cur_message)
+        print(cur_message, end='')
+        sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
+
+
+if __name__ == '__main__':
+    user_input = "Please give me a step-by-step guide on how to plant a tree in my backyard."
+
+    # Basic example
+    history = {'internal': [], 'visible': []}
+
+    # "Continue" example. Make sure to set '_continue' to True above
+    # arr = [user_input, 'Surely, here is']
+    # history = {'internal': [arr], 'visible': [arr]}
+
+    asyncio.run(print_response_stream(user_input, history))
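
A minimal sketch of carrying the chat history across turns with the run() generator defined in the file above; the chat_turn helper and the two-turn usage are illustrative assumptions, not part of the committed file, and they expect the same local server on port 5005.

# Hypothetical helper: keep only the final history snapshot from the stream,
# so the updated history can be passed back in on the next turn.
async def chat_turn(user_input, history):
    async for new_history in run(user_input, history):
        history = new_history
    return history

# Illustrative two-turn usage (the prompts are placeholders):
# history = {'internal': [], 'visible': []}
# history = asyncio.run(chat_turn("Hello!", history))
# history = asyncio.run(chat_turn("Can you give more detail?", history))
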
api-examples/api-example-model.py ADDED
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+import requests
+
+HOST = '0.0.0.0:5000'
+
+
+def generate(prompt, tokens=200):
+    request = {'prompt': prompt, 'max_new_tokens': tokens}
+    response = requests.post(f'http://{HOST}/api/v1/generate', json=request)
+
+    if response.status_code == 200:
+        return response.json()['results'][0]['text']
+
+
+def model_api(request):
+    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
+    return response.json()
+
+
+# print some common settings
+def print_basic_model_info(response):
+    basic_settings = ['truncation_length', 'instruction_template']
+    print("Model: ", response['result']['model_name'])
+    print("Lora(s): ", response['result']['lora_names'])
+    for setting in basic_settings:
+        print(setting, "=", response['result']['shared.settings'][setting])
+
+
+# model info
+def model_info():
+    response = model_api({'action': 'info'})
+    print_basic_model_info(response)
+
+
+# simple loader
+def model_load(model_name):
+    return model_api({'action': 'load', 'model_name': model_name})
+
+
+# complex loader
+def complex_model_load(model):
+
+    def guess_groupsize(model_name):
+        if '1024g' in model_name:
+            return 1024
+        elif '128g' in model_name:
+            return 128
+        elif '32g' in model_name:
+            return 32
+        else:
+            return -1
+
+    req = {
+        'action': 'load',
+        'model_name': model,
+        'args': {
+            'loader': 'AutoGPTQ',
+
+            'bf16': False,
+            'load_in_8bit': False,
+            'groupsize': 0,
+            'wbits': 0,
+
+            # llama.cpp
+            'threads': 0,
+            'n_batch': 512,
+            'no_mmap': False,
+            'mlock': False,
+            'cache_capacity': None,
+            'n_gpu_layers': 0,
+            'n_ctx': 2048,
+
+            # RWKV
+            'rwkv_strategy': None,
+            'rwkv_cuda_on': False,
+
+            # b&b 4-bit
+            # 'load_in_4bit': False,
+            # 'compute_dtype': 'float16',
+            # 'quant_type': 'nf4',
+            # 'use_double_quant': False,
+
+            # "cpu": false,
+            # "auto_devices": false,
+            # "gpu_memory": null,
+            # "cpu_memory": null,
+            # "disk": false,
+            # "disk_cache_dir": "cache",
+        },
+    }
+
+    model = model.lower()
+
+    if '4bit' in model or 'gptq' in model or 'int4' in model:
+        req['args']['wbits'] = 4
+        req['args']['groupsize'] = guess_groupsize(model)
+    elif '3bit' in model:
+        req['args']['wbits'] = 3
+        req['args']['groupsize'] = guess_groupsize(model)
+    else:
+        req['args']['gptq_for_llama'] = False
+
+    if '8bit' in model:
+        req['args']['load_in_8bit'] = True
+    elif '-hf' in model or 'fp16' in model:
+        if '7b' in model:
+            req['args']['bf16'] = True  # for 24GB
+        elif '13b' in model:
+            req['args']['load_in_8bit'] = True  # for 24GB
+    elif 'ggml' in model:
+        # req['args']['threads'] = 16
+        if '7b' in model:
+            req['args']['n_gpu_layers'] = 100
+        elif '13b' in model:
+            req['args']['n_gpu_layers'] = 100
+        elif '30b' in model or '33b' in model:
+            req['args']['n_gpu_layers'] = 59  # 24GB
+        elif '65b' in model:
+            req['args']['n_gpu_layers'] = 42  # 24GB
+    elif 'rwkv' in model:
+        req['args']['rwkv_cuda_on'] = True
+        if '14b' in model:
+            req['args']['rwkv_strategy'] = 'cuda f16i8'  # 24GB
+        else:
+            req['args']['rwkv_strategy'] = 'cuda f16'  # 24GB
+
+    return model_api(req)
+
+
+if __name__ == '__main__':
+    for model in model_api({'action': 'list'})['result']:
+        try:
+            resp = complex_model_load(model)
+
+            if 'error' in resp:
+                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
+                continue
+            else:
+                print_basic_model_info(resp)
+
+            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)
+
+            if '21' in ans:
+                print(f"✅ {model} PASS ({ans})")
+            else:
+                print(f"❌ {model} FAIL ({ans})")
+
+        except Exception as e:
+            print(f"❌ {model} FAIL Exception: {repr(e)}")
+
+
+# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
+# Some results below.
+""" $ ./model-api-example.py
+Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
+Lora(s): []
+truncation_length = 2048
+instruction_template = Alpaca
+✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21)
+Model: 4bit_WizardLM-13B-Uncensored-4bit-128g
+Lora(s): []
+truncation_length = 2048
+instruction_template = WizardLM
+✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21)
+Model: Aeala_VicUnlocked-alpaca-30b-4bit
+Lora(s): []
+truncation_length = 2048
+instruction_template = Alpaca
+✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21)
+Model: alpaca-30b-4bit
+Lora(s): []
+truncation_length = 2048
+instruction_template = Alpaca
+✅ alpaca-30b-4bit PASS (21)
+"""
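
For a single known model, a smaller sketch using the model_load(), print_basic_model_info(), and generate() helpers from the file above may be enough; the model name below is a placeholder, and the snippet assumes the API server is reachable at the same HOST.

# Hypothetical single-model check; 'your-model-name' is a placeholder for a
# name returned by model_api({'action': 'list'}) on your server.
resp = model_load('your-model-name')
if 'error' in resp:
    print(resp['error']['message'])
else:
    print_basic_model_info(resp)
    print(generate("0,1,1,2,3,5,8,13,", tokens=2))
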
api-examples/api-example-stream.py ADDED
@@ -0,0 +1,80 @@
+import asyncio
+import json
+import sys
+
+try:
+    import websockets
+except ImportError:
+    print("Websockets package not found. Make sure it's installed.")
+
+# For local streaming, the websockets are hosted without ssl - ws://
+HOST = 'localhost:5005'
+URI = f'ws://{HOST}/api/v1/stream'
+
+# For reverse-proxied streaming, the remote will likely host with ssl - wss://
+# URI = 'wss://your-uri-here.trycloudflare.com/api/v1/stream'
+
+
+async def run(context):
+    # Note: the selected defaults change from time to time.
+    request = {
+        'prompt': context,
+        'max_new_tokens': 250,
+
+        # Generation params. If 'preset' is set to different than 'None', the values
+        # in presets/preset-name.yaml are used instead of the individual numbers.
+        'preset': 'None',
+        'do_sample': True,
+        'temperature': 0.7,
+        'top_p': 0.1,
+        'typical_p': 1,
+        'epsilon_cutoff': 0,  # In units of 1e-4
+        'eta_cutoff': 0,  # In units of 1e-4
+        'tfs': 1,
+        'top_a': 0,
+        'repetition_penalty': 1.18,
+        'repetition_penalty_range': 0,
+        'top_k': 40,
+        'min_length': 0,
+        'no_repeat_ngram_size': 0,
+        'num_beams': 1,
+        'penalty_alpha': 0,
+        'length_penalty': 1,
+        'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
+
+        'seed': -1,
+        'add_bos_token': True,
+        'truncation_length': 2048,
+        'ban_eos_token': False,
+        'skip_special_tokens': True,
+        'stopping_strings': []
+    }
+
+    async with websockets.connect(URI, ping_interval=None) as websocket:
+        await websocket.send(json.dumps(request))
+
+        yield context  # Remove this if you just want to see the reply
+
+        while True:
+            incoming_data = await websocket.recv()
+            incoming_data = json.loads(incoming_data)
+
+            match incoming_data['event']:
+                case 'text_stream':
+                    yield incoming_data['text']
+                case 'stream_end':
+                    return
+
+
+async def print_response_stream(prompt):
+    async for response in run(prompt):
+        print(response, end='')
+        sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
+
+
+if __name__ == '__main__':
+    prompt = "In order to make homemade bread, follow these steps:\n1)"
+    asyncio.run(print_response_stream(prompt))
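
A small variant sketch that collects the streamed chunks into one string instead of printing them; the collect_response helper is an assumption layered on the run() generator above, and as written there the first chunk echoes the prompt unless that yield is removed.

# Hypothetical helper built on run() above: accumulate chunks instead of printing.
async def collect_response(prompt):
    chunks = []
    async for chunk in run(prompt):
        chunks.append(chunk)
    return ''.join(chunks)

# full_text = asyncio.run(collect_response("In order to make homemade bread, follow these steps:\n1)"))
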
api-examples/api-example.py ADDED
@@ -0,0 +1,57 @@
+import requests
+
+# For local streaming, the websockets are hosted without ssl - http://
+HOST = 'localhost:5000'
+URI = f'http://{HOST}/api/v1/generate'
+
+# For reverse-proxied streaming, the remote will likely host with ssl - https://
+# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'
+
+
+def run(prompt):
+    request = {
+        'prompt': prompt,
+        'max_new_tokens': 250,
+
+        # Generation params. If 'preset' is set to different than 'None', the values
+        # in presets/preset-name.yaml are used instead of the individual numbers.
+        'preset': 'None',
+        'do_sample': True,
+        'temperature': 0.7,
+        'top_p': 0.1,
+        'typical_p': 1,
+        'epsilon_cutoff': 0,  # In units of 1e-4
+        'eta_cutoff': 0,  # In units of 1e-4
+        'tfs': 1,
+        'top_a': 0,
+        'repetition_penalty': 1.18,
+        'repetition_penalty_range': 0,
+        'top_k': 40,
+        'min_length': 0,
+        'no_repeat_ngram_size': 0,
+        'num_beams': 1,
+        'penalty_alpha': 0,
+        'length_penalty': 1,
+        'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
+
+        'seed': -1,
+        'add_bos_token': True,
+        'truncation_length': 2048,
+        'ban_eos_token': False,
+        'skip_special_tokens': True,
+        'stopping_strings': []
+    }
+
+    response = requests.post(URI, json=request)
+
+    if response.status_code == 200:
+        result = response.json()['results'][0]['text']
+        print(prompt + result)
+
+
+if __name__ == '__main__':
+    prompt = "In order to make homemade bread, follow these steps:\n1)"
+    run(prompt)
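
A hedged variant of run() that returns the completion instead of printing it and raises on non-200 responses; generate_text is a hypothetical name, it reuses the URI defined above, and omitted generation parameters are assumed to fall back to the server's defaults.

# Hypothetical minimal wrapper around the same blocking /api/v1/generate endpoint.
def generate_text(prompt, max_new_tokens=250):
    payload = {'prompt': prompt, 'max_new_tokens': max_new_tokens}  # other params: server defaults (assumption)
    response = requests.post(URI, json=payload)
    response.raise_for_status()  # surface HTTP errors instead of silently returning None
    return response.json()['results'][0]['text']

# print(generate_text("In order to make homemade bread, follow these steps:\n1)"))
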