a100 kh committed on
Commit
0f859ae
·
1 Parent(s): cac0300
api_endpoints copy.json → api_endpoints all.json RENAMED
@@ -120,8 +120,8 @@
120
  "tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0": {
121
  "model_name": "tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0",
122
  "api_type": "openai-llama3.1",
123
- "api_base": "http://localhost:8010/v1",
124
- "api_key": "12345",
125
  "anony_only": false,
126
  "recommended_config": {
127
  "temperature": 0.7,
@@ -133,8 +133,8 @@
133
  "cyberagent/calm3-22b-chat-BitsAndBytes": {
134
  "model_name": "cyberagent/calm3-22b-chat",
135
  "api_type": "openai-custom-calm",
136
- "api_base": "http://localhost:8011/v1",
137
- "api_key": "12345",
138
  "anony_only": false,
139
  "recommended_config": {
140
  "temperature": 0.7,
@@ -146,8 +146,8 @@
146
  "weblab-GENIAC/Tanuki-8B-dpo-v1.0-BitsAndBytes": {
147
  "model_name": "weblab-GENIAC/Tanuki-8B-dpo-v1.0",
148
  "api_type": "openai-custom-tanuki",
149
- "api_base": "http://localhost:8012/v1",
150
- "api_key": "12345",
151
  "anony_only": false,
152
  "recommended_config": {
153
  "temperature": 0.7,
@@ -159,8 +159,8 @@
159
  "llm-jp-3-13b-instruct-Q8_0.gguf": {
160
  "model_name": "llm-jp-3-13b-instruct-Q8_0.gguf",
161
  "api_type": "openai-llmjp3",
162
- "api_base": "http://localhost:8016/v1",
163
- "api_key": "12345",
164
  "anony_only": false,
165
  "recommended_config": {
166
  "temperature": 0.7,
@@ -172,8 +172,8 @@
172
  "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1-BitsAndBytes": {
173
  "model_name": "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1",
174
  "api_type": "openai-llama3.1",
175
- "api_base": "http://localhost:8019/v1",
176
- "api_key": "12345",
177
  "anony_only": false,
178
  "recommended_config": {
179
  "temperature": 0.7,
 
120
  "tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0": {
121
  "model_name": "tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0",
122
  "api_type": "openai-llama3.1",
123
+ "api_end": "Swallow-8B",
124
+ "env_api_key": "VLLM_API_KEY",
125
  "anony_only": false,
126
  "recommended_config": {
127
  "temperature": 0.7,
 
133
  "cyberagent/calm3-22b-chat-BitsAndBytes": {
134
  "model_name": "cyberagent/calm3-22b-chat",
135
  "api_type": "openai-custom-calm",
136
+ "api_end": "calm3-22b-chat",
137
+ "env_api_key": "VLLM_API_KEY",
138
  "anony_only": false,
139
  "recommended_config": {
140
  "temperature": 0.7,
 
146
  "weblab-GENIAC/Tanuki-8B-dpo-v1.0-BitsAndBytes": {
147
  "model_name": "weblab-GENIAC/Tanuki-8B-dpo-v1.0",
148
  "api_type": "openai-custom-tanuki",
149
+ "api_end": "Tanuki-8B-dpo",
150
+ "env_api_key": "VLLM_API_KEY",
151
  "anony_only": false,
152
  "recommended_config": {
153
  "temperature": 0.7,
 
159
  "llm-jp-3-13b-instruct-Q8_0.gguf": {
160
  "model_name": "llm-jp-3-13b-instruct-Q8_0.gguf",
161
  "api_type": "openai-llmjp3",
162
+ "api_end": "llm-jp-13b",
163
+ "env_api_key": "VLLM_API_KEY",
164
  "anony_only": false,
165
  "recommended_config": {
166
  "temperature": 0.7,
 
172
  "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1-BitsAndBytes": {
173
  "model_name": "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1",
174
  "api_type": "openai-llama3.1",
175
+ "api_end": "swallow70",
176
+ "env_api_key": "VLLM_API_KEY",
177
  "anony_only": false,
178
  "recommended_config": {
179
  "temperature": 0.7,
api_endpoints.json CHANGED
@@ -116,5 +116,70 @@
116
  },
117
  "text-arena": true,
118
  "vision-arena": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  }
120
  }
 
116
  },
117
  "text-arena": true,
118
  "vision-arena": false
119
+ },
120
+ "tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0": {
121
+ "model_name": "tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0",
122
+ "api_type": "openai-llama3.1",
123
+ "api_end": "Swallow-8B",
124
+ "env_api_key": "VLLM_API_KEY",
125
+ "anony_only": false,
126
+ "recommended_config": {
127
+ "temperature": 0.7,
128
+ "top_p": 1.0
129
+ },
130
+ "text-arena": true,
131
+ "vision-arena": false
132
+ },
133
+ "cyberagent/calm3-22b-chat-BitsAndBytes": {
134
+ "model_name": "cyberagent/calm3-22b-chat",
135
+ "api_type": "openai-custom-calm",
136
+ "api_end": "calm3-22b-chat",
137
+ "env_api_key": "VLLM_API_KEY",
138
+ "anony_only": false,
139
+ "recommended_config": {
140
+ "temperature": 0.7,
141
+ "top_p": 1.0
142
+ },
143
+ "text-arena": true,
144
+ "vision-arena": false
145
+ },
146
+ "weblab-GENIAC/Tanuki-8B-dpo-v1.0-BitsAndBytes": {
147
+ "model_name": "weblab-GENIAC/Tanuki-8B-dpo-v1.0",
148
+ "api_type": "openai-custom-tanuki",
149
+ "api_end": "Tanuki-8B-dpo",
150
+ "env_api_key": "VLLM_API_KEY",
151
+ "anony_only": false,
152
+ "recommended_config": {
153
+ "temperature": 0.7,
154
+ "top_p": 1.0
155
+ },
156
+ "text-arena": true,
157
+ "vision-arena": false
158
+ },
159
+ "llm-jp-3-13b-instruct-Q8_0.gguf": {
160
+ "model_name": "llm-jp-3-13b-instruct-Q8_0.gguf",
161
+ "api_type": "openai-llmjp3",
162
+ "api_end": "llm-jp-13b",
163
+ "env_api_key": "VLLM_API_KEY",
164
+ "anony_only": false,
165
+ "recommended_config": {
166
+ "temperature": 0.7,
167
+ "top_p": 1.0
168
+ },
169
+ "text-arena": true,
170
+ "vision-arena": false
171
+ },
172
+ "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1-BitsAndBytes": {
173
+ "model_name": "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1",
174
+ "api_type": "openai-llama3.1",
175
+ "api_end": "swallow70",
176
+ "env_api_key": "VLLM_API_KEY",
177
+ "anony_only": false,
178
+ "recommended_config": {
179
+ "temperature": 0.7,
180
+ "top_p": 1.0
181
+ },
182
+ "text-arena": true,
183
+ "vision-arena": false
184
  }
185
  }
api_endpoints_apis.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "claude-3-5-sonnet-20240620": {
3
+ "model_name": "claude-3-5-sonnet-20240620",
4
+ "api_type": "anthropic",
5
+ "anony_only": false,
6
+ "recommended_config": {
7
+ "temperature": 0.7,
8
+ "top_p": 1.0
9
+ },
10
+ "text-arena": true,
11
+ "vision-arena": false
12
+ },
13
+ "command-r-plus": {
14
+ "model_name": "command-r-plus",
15
+ "api_type": "cohere",
16
+ "anony_only": false,
17
+ "recommended_config": {
18
+ "temperature": 0.7,
19
+ "top_p": 1.0
20
+ },
21
+ "text-arena": true,
22
+ "vision-arena": false
23
+ },
24
+ "deepseek-chat": {
25
+ "model_name": "deepseek-chat",
26
+ "api_type": "openai-custom-deepinfra",
27
+ "api_base": "https://api.deepseek.com/v1",
28
+ "env_api_key": "DEEPSEEK_API_KEY",
29
+ "anony_only": false,
30
+ "recommended_config": {
31
+ "temperature": 0.7,
32
+ "top_p": 1.0
33
+ },
34
+ "text-arena": true,
35
+ "vision-arena": false
36
+ },
37
+ "mistral-large-latest": {
38
+ "model_name": "mistral-large-latest",
39
+ "api_type": "mistral",
40
+ "anony_only": false,
41
+ "recommended_config": {
42
+ "temperature": 0.7,
43
+ "top_p": 1.0
44
+ },
45
+ "text-arena": true,
46
+ "vision-arena": false
47
+ },
48
+ "Qwen/Qwen2.5-72B-Instruct": {
49
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
50
+ "api_type": "openai-custom-deepinfra",
51
+ "api_base": "https://api.deepinfra.com/v1/openai",
52
+ "env_api_key": "DEEPINFRA_API_KEY",
53
+ "anony_only": false,
54
+ "recommended_config": {
55
+ "temperature": 0.7,
56
+ "top_p": 1.0
57
+ },
58
+ "text-arena": true,
59
+ "vision-arena": false
60
+ },
61
+ "google/gemma-2-27b-it": {
62
+ "model_name": "google/gemma-2-27b-it",
63
+ "api_type": "openai-custom-deepinfra",
64
+ "api_base": "https://api.deepinfra.com/v1/openai",
65
+ "env_api_key": "DEEPINFRA_API_KEY",
66
+ "anony_only": false,
67
+ "recommended_config": {
68
+ "temperature": 0.7,
69
+ "top_p": 1.0
70
+ },
71
+ "text-arena": true,
72
+ "vision-arena": false
73
+ },
74
+ "gemini-1.5-flash-latest": {
75
+ "model_name": "gemini-1.5-flash-latest",
76
+ "api_type": "gemini",
77
+ "anony_only": false,
78
+ "recommended_config": {
79
+ "temperature": 0.7,
80
+ "top_p": 1.0
81
+ },
82
+ "text-arena": true,
83
+ "vision-arena": false
84
+ },
85
+ "gemini-1.5-pro-latest": {
86
+ "model_name": "gemini-1.5-pro-latest",
87
+ "api_type": "gemini",
88
+ "anony_only": false,
89
+ "recommended_config": {
90
+ "temperature": 0.7,
91
+ "top_p": 1.0
92
+ },
93
+ "text-arena": true,
94
+ "vision-arena": false
95
+ },
96
+ "gpt-4-turbo-2024-04-09": {
97
+ "model_name": "gpt-4-turbo-2024-04-09",
98
+ "api_type": "openai",
99
+ "api_base": "https://api.openai.com/v1",
100
+ "anony_only": false,
101
+ "recommended_config": {
102
+ "temperature": 0.7,
103
+ "top_p": 1.0
104
+ },
105
+ "text-arena": true,
106
+ "vision-arena": false
107
+ },
108
+ "gpt-4o-mini-2024-07-18": {
109
+ "model_name": "gpt-4o-mini-2024-07-18",
110
+ "api_type": "openai",
111
+ "api_base": "https://api.openai.com/v1",
112
+ "anony_only": false,
113
+ "recommended_config": {
114
+ "temperature": 0.7,
115
+ "top_p": 1.0
116
+ },
117
+ "text-arena": true,
118
+ "vision-arena": false
119
+ }
120
+ }
local/local_setup ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #install nginx
2
+ sudo apt update
3
+ sudo apt install nginx
4
+
5
+
6
+ #launch local server
7
+
8
+ export CUDA_VISIBLE_DEVICES=0
9
+ python -m vllm.entrypoints.openai.api_server --model cyberagent/calm3-22b-chat \
10
+ --max-model-len 4096 --port 8011 \
11
+ --gpu-memory-utilization 0.4 --trust-remote-code \
12
+ --quantization bitsandbytes --load-format bitsandbytes \
13
+ --api-key $VLLM_API_KEY
14
+
15
+ #vllm tanuki8
16
+ export CUDA_VISIBLE_DEVICES=0
17
+ python -m vllm.entrypoints.openai.api_server --model weblab-GENIAC/Tanuki-8B-dpo-v1.0 --max-model-len 4096 --port 8012 --gpu-memory-utilization 0.2 --trust-remote-code --quantization bitsandbytes --load-format bitsandbytes --api-key $VLLM_API_KEY
18
+
19
+
20
+ export CUDA_VISIBLE_DEVICES=0
21
+ #llama.cpp swallow 8b
22
+ ../llama-server -m tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0.gguf --n_gpu_layers 100 --port 8010
23
+
24
+ #llmjp13b
25
+ export CUDA_VISIBLE_DEVICES=0
26
+ ../llama-server -m llm-jp-3-13b-instruct-Q8_0.gguf --n_gpu_layers 100 --port 8016
27
+
28
+ #swallow70
29
+ export CUDA_VISIBLE_DEVICES=1
30
+ python -m vllm.entrypoints.openai.api_server --model tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1 --max-model-len 4096 --port 8019 --gpu-memory-utilization 0.6 --trust-remote-code --quantization bitsandbytes --load-format bitsandbytes --api-key $VLLM_API_KEY
31
+
32
+
33
+ #launch ngrok
34
+ ngrok http http://localhost:8765
local/nginx ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #sudo vi /etc/nginx/sites-available/default
2
+ #sudo systemctl restart nginx
3
+
4
+
5
+ server {
6
+ listen 8765; # 一つのポートでまとめる
7
+
8
+ location /swallow70/ {proxy_pass http://localhost:8019/v1/;}
9
+ location /llm-jp-13b/ {proxy_pass http://localhost:8016/v1/;}
10
+ location /Tanuki-8B-dpo/ {proxy_pass http://localhost:8012/v1/;}
11
+ location /calm3-22b-chat/ {proxy_pass http://localhost:8011/v1/;}
12
+ location /Swallow-8B/ {proxy_pass http://localhost:8010/v1/;}
13
+ }
14
+
serve/api_provider.py CHANGED
@@ -54,6 +54,13 @@ def get_api_provider_stream_iter(
54
  else:
55
  api_key = os.environ[model_api_dict["env_api_key"]]
56
 
 
 
 
 
 
 
 
57
  messages = conv.to_openai_api_messages()
58
  stream_iter = openai_api_stream_iter(
59
  model_api_dict["model_name"],
@@ -61,7 +68,7 @@ def get_api_provider_stream_iter(
61
  temperature,
62
  top_p,
63
  max_new_tokens,
64
- api_base=model_api_dict["api_base"],
65
  api_key=api_key,
66
  # api_key=os.environ[model_api_dict["env_api_key"]],
67
  # api_key=model_api_dict["api_key"],
@@ -77,8 +84,8 @@ def get_api_provider_stream_iter(
77
  temperature,
78
  top_p,
79
  max_new_tokens,
80
- api_base=model_api_dict["api_base"],
81
- api_key=model_api_dict["api_key"],
82
  stop="<|im_end|>",
83
  )
84
  elif model_api_dict["api_type"] == "openai-llmjp3":
@@ -92,8 +99,8 @@ def get_api_provider_stream_iter(
92
  temperature,
93
  top_p,
94
  max_new_tokens,
95
- api_base=model_api_dict["api_base"],
96
- api_key=model_api_dict["api_key"],
97
  stop="<|im_end|>",
98
  )
99
  elif model_api_dict["api_type"] == "openai_no_stream":
 
54
  else:
55
  api_key = os.environ[model_api_dict["env_api_key"]]
56
 
57
+ if "api_base" in model_api_dict:
58
+ api_base = model_api_dict["api_base"]
59
+ elif "api_end" in model_api_dict:
60
+ api_base = os.environ["LOCAL_LLM_URL"]
61
+ end = model_api_dict["api_end"]
62
+ api_base = f"{api_base}/{end}/"
63
+
64
  messages = conv.to_openai_api_messages()
65
  stream_iter = openai_api_stream_iter(
66
  model_api_dict["model_name"],
 
68
  temperature,
69
  top_p,
70
  max_new_tokens,
71
+ api_base=api_base,
72
  api_key=api_key,
73
  # api_key=os.environ[model_api_dict["env_api_key"]],
74
  # api_key=model_api_dict["api_key"],
 
84
  temperature,
85
  top_p,
86
  max_new_tokens,
87
+ api_base=f"{os.environ["LOCAL_LLM_URL"]}/{model_api_dict["api_end"]}/",
88
+ api_key=os.environ[model_api_dict["env_api_key"]],
89
  stop="<|im_end|>",
90
  )
91
  elif model_api_dict["api_type"] == "openai-llmjp3":
 
99
  temperature,
100
  top_p,
101
  max_new_tokens,
102
+ api_base=f"{os.environ["LOCAL_LLM_URL"]}/{model_api_dict["api_end"]}/",
103
+ api_key=os.environ[model_api_dict["env_api_key"]],
104
  stop="<|im_end|>",
105
  )
106
  elif model_api_dict["api_type"] == "openai_no_stream":