zhangbofei committed on
Commit
6dc0c9c
·
1 Parent(s): c00fe36

feat: change to fstchat

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +10 -4
  2. gradio_web_server.log +8 -0
  3. gradio_web_server_multi.log +0 -0
  4. requirement.txt +2 -0
  5. src/__init__.py +0 -0
  6. src/__pycache__/__init__.cpython-310.pyc +0 -0
  7. src/__pycache__/constants.cpython-310.pyc +0 -0
  8. src/__pycache__/conversation.cpython-310.pyc +0 -0
  9. src/__pycache__/utils.cpython-310.pyc +0 -0
  10. src/constants.py +75 -0
  11. src/conversation.py +2104 -0
  12. src/model/__init__.py +5 -0
  13. src/model/__pycache__/__init__.cpython-310.pyc +0 -0
  14. src/model/__pycache__/compression.cpython-310.pyc +0 -0
  15. src/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc +0 -0
  16. src/model/__pycache__/model_adapter.cpython-310.pyc +0 -0
  17. src/model/__pycache__/model_chatglm.cpython-310.pyc +0 -0
  18. src/model/__pycache__/model_cllm.cpython-310.pyc +0 -0
  19. src/model/__pycache__/model_codet5p.cpython-310.pyc +0 -0
  20. src/model/__pycache__/model_exllama.cpython-310.pyc +0 -0
  21. src/model/__pycache__/model_falcon.cpython-310.pyc +0 -0
  22. src/model/__pycache__/model_registry.cpython-310.pyc +0 -0
  23. src/model/__pycache__/model_xfastertransformer.cpython-310.pyc +0 -0
  24. src/model/__pycache__/model_yuan2.cpython-310.pyc +0 -0
  25. src/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc +0 -0
  26. src/model/apply_delta.py +165 -0
  27. src/model/apply_lora.py +48 -0
  28. src/model/compression.py +312 -0
  29. src/model/convert_fp16.py +26 -0
  30. src/model/llama_condense_monkey_patch.py +71 -0
  31. src/model/make_delta.py +48 -0
  32. src/model/model_adapter.py +2524 -0
  33. src/model/model_chatglm.py +137 -0
  34. src/model/model_cllm.py +202 -0
  35. src/model/model_codet5p.py +108 -0
  36. src/model/model_exllama.py +77 -0
  37. src/model/model_falcon.py +140 -0
  38. src/model/model_registry.py +764 -0
  39. src/model/model_xfastertransformer.py +81 -0
  40. src/model/model_yuan2.py +139 -0
  41. src/model/monkey_patch_non_inplace.py +119 -0
  42. src/model/rwkv_model.py +76 -0
  43. src/model/upload_hub.py +45 -0
  44. src/modules/__init__.py +0 -0
  45. src/modules/__pycache__/__init__.cpython-310.pyc +0 -0
  46. src/modules/__pycache__/awq.cpython-310.pyc +0 -0
  47. src/modules/__pycache__/exllama.cpython-310.pyc +0 -0
  48. src/modules/__pycache__/gptq.cpython-310.pyc +0 -0
  49. src/modules/__pycache__/xfastertransformer.cpython-310.pyc +0 -0
  50. src/modules/awq.py +85 -0
app.py CHANGED
@@ -1,7 +1,13 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
+ from src.serve.gradio_block_arena_vision_named import build_side_by_side_vision_ui_named
 
4
 
5
+
6
if __name__ == "__main__":
    # Model names handed to the UI builder.
    arena_models = ["llava-fire", "llava-original"]

    # Build the UI inside a Blocks context, then start the Gradio server.
    # NOTE(review): indentation was lost in this view; launching after the
    # `with` block closes is assumed -- confirm against the real file.
    with gr.Blocks() as demo:
        states = build_side_by_side_vision_ui_named(models=arena_models)

    demo.launch()
gradio_web_server.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 2024-07-01 14:35:43 | INFO | stdout | Running on local URL: http://127.0.0.1:7860
2
+ 2024-07-01 14:35:43 | INFO | stdout | Running on local URL: http://127.0.0.1:7860
3
+ 2024-07-01 14:35:43 | INFO | stdout |
4
+ 2024-07-01 14:35:43 | INFO | stdout |
5
+ 2024-07-01 14:35:43 | INFO | stdout | To create a public link, set `share=True` in `launch()`.
6
+ 2024-07-01 14:35:43 | INFO | stdout | To create a public link, set `share=True` in `launch()`.
7
+ 2024-07-01 14:35:45 | INFO | stdout | Keyboard interruption in main thread... closing server.
8
+ 2024-07-01 14:35:45 | INFO | stdout | Keyboard interruption in main thread... closing server.
gradio_web_server_multi.log ADDED
File without changes
requirement.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ transformers
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (182 Bytes). View file
 
src/__pycache__/constants.cpython-310.pyc ADDED
Binary file (2.61 kB). View file
 
src/__pycache__/conversation.cpython-310.pyc ADDED
Binary file (37.8 kB). View file
 
src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (14 kB). View file
 
src/constants.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Global constants.
3
+ """
4
+
5
+ from enum import IntEnum
6
+ import os
7
+
8
def _env_int(name, default):
    """Return environment variable *name* as an int, or *default* when unset."""
    return int(os.getenv(name, default))


# Directory one level above the directory that contains this file.
REPO_PATH = os.path.dirname(os.path.dirname(__file__))

##### For the gradio web server
SERVER_ERROR_MSG = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
TEXT_MODERATION_MSG = "$MODERATION$ YOUR TEXT VIOLATES OUR CONTENT MODERATION GUIDELINES."
IMAGE_MODERATION_MSG = "$MODERATION$ YOUR IMAGE VIOLATES OUR CONTENT MODERATION GUIDELINES."
MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
SLOW_MODEL_MSG = "⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds."
RATE_LIMIT_MSG = "**RATE LIMIT OF THIS MODEL IS REACHED. PLEASE COME BACK LATER OR USE BATTLE MODE (the 1st tab).**"

# Maximum input length (characters); both limits can be overridden via ENV.
INPUT_CHAR_LEN_LIMIT = _env_int("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000)
BLIND_MODE_INPUT_CHAR_LEN_LIMIT = _env_int(
    "FASTCHAT_BLIND_MODE_INPUT_CHAR_LEN_LIMIT", 24000
)
# Maximum conversation turns
CONVERSATION_TURN_LIMIT = 50
# Session expiration time
SESSION_EXPIRATION_TIME = 3600
# The output dir of log files (defaults to the current directory)
LOGDIR = os.getenv("LOGDIR", ".")
# CPU Instruction Set Architecture (None when the variable is unset)
CPU_ISA = os.getenv("CPU_ISA")


##### For the controller and workers (could be overwritten through ENV variables.)
CONTROLLER_HEART_BEAT_EXPIRATION = _env_int(
    "FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90
)
WORKER_HEART_BEAT_INTERVAL = _env_int("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 45)
WORKER_API_TIMEOUT = _env_int("FASTCHAT_WORKER_API_TIMEOUT", 100)
WORKER_API_EMBEDDING_BATCH_SIZE = _env_int(
    "FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE", 4
)
49
+
50
+
51
class ErrorCode(IntEnum):
    """
    Integer error codes used across the serving stack.

    Reference: https://platform.openai.com/docs/guides/error-codes/api-errors
    """

    # 400xx: request validation
    VALIDATION_TYPE_ERROR = 40001

    # 401xx: authentication / permission
    INVALID_AUTH_KEY = 40101
    INCORRECT_AUTH_KEY = 40102
    NO_PERMISSION = 40103

    # 403xx: model / parameter / context problems
    INVALID_MODEL = 40301
    PARAM_OUT_OF_RANGE = 40302
    CONTEXT_OVERFLOW = 40303

    # 429xx: rate limiting and capacity
    RATE_LIMIT = 42901
    QUOTA_EXCEEDED = 42902
    ENGINE_OVERLOADED = 42903

    # 500xx: server-side failures
    INTERNAL_ERROR = 50001
    CUDA_OUT_OF_MEMORY = 50002
    GRADIO_REQUEST_ERROR = 50003
    GRADIO_STREAM_UNKNOWN_ERROR = 50004
    CONTROLLER_NO_WORKER = 50005
    CONTROLLER_WORKER_TIMEOUT = 50006
src/conversation.py ADDED
@@ -0,0 +1,2104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+
8
+ import base64
9
+ import dataclasses
10
+ from enum import auto, IntEnum
11
+ from io import BytesIO
12
+ import os
13
+ from typing import List, Any, Dict, Union, Tuple
14
+
15
+
16
+ class SeparatorStyle(IntEnum):
17
+ """Separator styles."""
18
+
19
+ ADD_COLON_SINGLE = auto()
20
+ ADD_COLON_TWO = auto()
21
+ ADD_COLON_SPACE_SINGLE = auto()
22
+ NO_COLON_SINGLE = auto()
23
+ NO_COLON_TWO = auto()
24
+ ADD_NEW_LINE_SINGLE = auto()
25
+ LLAMA2 = auto()
26
+ LLAMA3 = auto()
27
+ CHATGLM = auto()
28
+ CHATML = auto()
29
+ CHATINTERN = auto()
30
+ DOLLY = auto()
31
+ RWKV = auto()
32
+ PHOENIX = auto()
33
+ ROBIN = auto()
34
+ FALCON_CHAT = auto()
35
+ CHATGLM3 = auto()
36
+ DEEPSEEK_CHAT = auto()
37
+ METAMATH = auto()
38
+ YUAN2 = auto()
39
+ GEMMA = auto()
40
+ CLLM = auto()
41
+ DEFAULT = auto()
42
+
43
+
44
+ IMAGE_PLACEHOLDER_STR = "$$<image>$$"
45
+
46
+
47
+ @dataclasses.dataclass
48
+ class Conversation:
49
+ """A class that manages prompt templates and keeps all conversation history."""
50
+
51
+ # The name of this template
52
+ name: str
53
+ # The template of the system prompt
54
+ system_template: str = "{system_message}"
55
+ # The system message
56
+ system_message: str = ""
57
+ # The names of two roles
58
+ roles: Tuple[str] = ("USER", "ASSISTANT")
59
+ # All messages. Each item is (role, message).
60
+ # Each message is either a string or a tuple of (string, List[image_url]).
61
+ messages: List[List[str]] = ()
62
+ # The number of few shot examples
63
+ offset: int = 0
64
+ # The separator style and configurations
65
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
66
+ sep: str = "\n"
67
+ sep2: str = None
68
+ # Stop criteria (the default one is EOS token)
69
+ stop_str: Union[str, List[str]] = None
70
+ # Stops generation if meeting any token in this list
71
+ stop_token_ids: List[int] = None
72
+ # The maximum image size in megabytes that this model takes in. None means we do not resize the image.
73
+ max_image_size_mb: int = None
74
+
75
+ def get_prompt(self) -> str:
76
+ """Get the prompt for generation."""
77
+ system_prompt = self.system_template.format(system_message=self.system_message)
78
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
79
+ ret = system_prompt + self.sep
80
+ for role, message in self.messages:
81
+ if message:
82
+ ret += role + ": " + message + self.sep
83
+ else:
84
+ ret += role + ":"
85
+ return ret
86
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
87
+ seps = [self.sep, self.sep2]
88
+ ret = system_prompt + seps[0]
89
+ for i, (role, message) in enumerate(self.messages):
90
+ if message:
91
+ if type(message) is tuple:
92
+ message, images = message
93
+ message = IMAGE_PLACEHOLDER_STR * len(images) + message
94
+ ret += role + ": " + message + seps[i % 2]
95
+ else:
96
+ ret += role + ":"
97
+ return ret
98
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
99
+ ret = system_prompt + self.sep
100
+ for role, message in self.messages:
101
+ if message:
102
+ ret += role + ": " + message + self.sep
103
+ else:
104
+ ret += role + ": " # must be end with a space
105
+ return ret
106
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
107
+ ret = "" if system_prompt == "" else system_prompt + self.sep
108
+ for role, message in self.messages:
109
+ if message:
110
+ ret += role + "\n" + message + self.sep
111
+ else:
112
+ ret += role + "\n"
113
+ return ret
114
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
115
+ ret = system_prompt
116
+ for role, message in self.messages:
117
+ if message:
118
+ ret += role + message + self.sep
119
+ else:
120
+ ret += role
121
+ return ret
122
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
123
+ seps = [self.sep, self.sep2]
124
+ ret = system_prompt
125
+ for i, (role, message) in enumerate(self.messages):
126
+ if message:
127
+ ret += role + message + seps[i % 2]
128
+ else:
129
+ ret += role
130
+ return ret
131
+ elif self.sep_style == SeparatorStyle.RWKV:
132
+ ret = system_prompt
133
+ for i, (role, message) in enumerate(self.messages):
134
+ if message:
135
+ ret += (
136
+ role
137
+ + ": "
138
+ + message.replace("\r\n", "\n").replace("\n\n", "\n")
139
+ )
140
+ ret += "\n\n"
141
+ else:
142
+ ret += role + ":"
143
+ return ret
144
+ elif self.sep_style == SeparatorStyle.LLAMA2:
145
+ seps = [self.sep, self.sep2]
146
+ if self.system_message:
147
+ ret = system_prompt
148
+ else:
149
+ ret = "[INST] "
150
+ for i, (role, message) in enumerate(self.messages):
151
+ tag = self.roles[i % 2]
152
+ if message:
153
+ if i == 0:
154
+ ret += message + " "
155
+ else:
156
+ ret += tag + " " + message + seps[i % 2]
157
+ else:
158
+ ret += tag
159
+ return ret
160
+ elif self.sep_style == SeparatorStyle.LLAMA3:
161
+ ret = "<|begin_of_text|>"
162
+ if self.system_message:
163
+ ret += system_prompt
164
+ else:
165
+ ret += ""
166
+ for i, (role, message) in enumerate(self.messages):
167
+ if message:
168
+ ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
169
+ ret += f"{message.strip()}<|eot_id|>"
170
+ else:
171
+ ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
172
+ return ret
173
+ elif self.sep_style == SeparatorStyle.CHATGLM:
174
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
175
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
176
+ round_add_n = 1 if self.name == "chatglm2" else 0
177
+ if system_prompt:
178
+ ret = system_prompt + self.sep
179
+ else:
180
+ ret = ""
181
+
182
+ for i, (role, message) in enumerate(self.messages):
183
+ if i % 2 == 0:
184
+ ret += f"[Round {i//2 + round_add_n}]{self.sep}"
185
+
186
+ if message:
187
+ ret += f"{role}:{message}{self.sep}"
188
+ else:
189
+ ret += f"{role}:"
190
+ return ret
191
+ elif self.sep_style == SeparatorStyle.CHATML:
192
+ ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
193
+ for role, message in self.messages:
194
+ if message:
195
+ if type(message) is tuple:
196
+ message, images = message
197
+ message = IMAGE_PLACEHOLDER_STR * len(images) + message
198
+ ret += role + "\n" + message + self.sep + "\n"
199
+ else:
200
+ ret += role + "\n"
201
+ return ret
202
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
203
+ ret = ""
204
+ if self.system_message:
205
+ ret += system_prompt
206
+ for role, message in self.messages:
207
+ if message:
208
+ ret += role + "\n" + message
209
+ else:
210
+ ret += role
211
+ return ret
212
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
213
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
214
+ seps = [self.sep, self.sep2]
215
+ ret = system_prompt
216
+ for i, (role, message) in enumerate(self.messages):
217
+ if i % 2 == 0:
218
+ ret += "<s>"
219
+ if message:
220
+ ret += role + ":" + message + seps[i % 2] + "\n"
221
+ else:
222
+ ret += role + ":"
223
+ return ret
224
+ elif self.sep_style == SeparatorStyle.DOLLY:
225
+ seps = [self.sep, self.sep2]
226
+ ret = system_prompt
227
+ for i, (role, message) in enumerate(self.messages):
228
+ if message:
229
+ ret += role + ":\n" + message + seps[i % 2]
230
+ if i % 2 == 1:
231
+ ret += "\n\n"
232
+ else:
233
+ ret += role + ":\n"
234
+ return ret
235
+ elif self.sep_style == SeparatorStyle.PHOENIX:
236
+ ret = system_prompt
237
+ for role, message in self.messages:
238
+ if message:
239
+ ret += role + ": " + "<s>" + message + "</s>"
240
+ else:
241
+ ret += role + ": " + "<s>"
242
+ return ret
243
+ elif self.sep_style == SeparatorStyle.ROBIN:
244
+ ret = system_prompt + self.sep
245
+ for role, message in self.messages:
246
+ if message:
247
+ ret += role + ":\n" + message + self.sep
248
+ else:
249
+ ret += role + ":\n"
250
+ return ret
251
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
252
+ ret = ""
253
+ if self.system_message:
254
+ ret += system_prompt + self.sep
255
+ for role, message in self.messages:
256
+ if message:
257
+ ret += role + ": " + message + self.sep
258
+ else:
259
+ ret += role + ":"
260
+ return ret
261
+ elif self.sep_style == SeparatorStyle.METAMATH:
262
+ ret = "" if system_prompt == "" else system_prompt + self.sep
263
+ for i, (role, message) in enumerate(self.messages):
264
+ # For MetaMath, sep2 is used to prefix the message.
265
+ starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2
266
+ ending_sep = self.sep if i % 2 == 0 else ""
267
+ if message:
268
+ ret += role + starting_sep + message + ending_sep
269
+ else:
270
+ ret += role + starting_sep
271
+ return ret
272
+ elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT:
273
+ seps = [self.sep, self.sep2]
274
+ ret = system_prompt
275
+ for i, (role, message) in enumerate(self.messages):
276
+ if message:
277
+ ret += role + ": " + message + seps[i % 2]
278
+ else:
279
+ ret += role + ":"
280
+ return ret
281
+ elif self.sep_style == SeparatorStyle.YUAN2:
282
+ seps = [self.sep, self.sep2]
283
+ ret = ""
284
+ if self.system_message:
285
+ ret += system_prompt + seps[1]
286
+ for _, message in self.messages:
287
+ if message:
288
+ ret += message + "<n>"
289
+ else:
290
+ ret += ""
291
+ ret = ret.rstrip("<n>") + seps[0]
292
+ return ret
293
+ elif self.sep_style == SeparatorStyle.GEMMA:
294
+ ret = "<bos>"
295
+ for role, message in self.messages:
296
+ if message:
297
+ ret += "<start_of_turn>" + role + "\n" + message + self.sep
298
+ else:
299
+ ret += "<start_of_turn>" + role + "\n"
300
+ return ret
301
+ elif self.sep_style == SeparatorStyle.CLLM:
302
+ seps = [self.sep, self.sep2]
303
+ ret = system_prompt + seps[0]
304
+ for i, (role, message) in enumerate(self.messages[-2:]):
305
+ if message:
306
+ if type(message) is tuple:
307
+ message, images = message
308
+ message = IMAGE_PLACEHOLDER_STR * len(images) + message
309
+ ret += role + ": " + message + seps[i % 2]
310
+ else:
311
+ ret += role + ":"
312
+ return ret
313
+ elif self.sep_style == SeparatorStyle.DEFAULT:
314
+ ret = system_prompt + "\n"
315
+ for role, message in self.messages:
316
+ if message:
317
+ if type(message) is tuple:
318
+ message, images = message
319
+ ret += role + ": " + message + "\n"
320
+ else:
321
+ ret += role + ":"
322
+ return ret
323
+ else:
324
+ raise ValueError(f"Invalid style: {self.sep_style}")
325
+
326
+ def get_images(self):
327
+ images = []
328
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
329
+ if i % 2 == 0:
330
+ if type(msg) is tuple:
331
+ for image in msg[1]:
332
+ images.append(image)
333
+
334
+ return images
335
+
336
+ def set_system_message(self, system_message: str):
337
+ """Set the system message."""
338
+ self.system_message = system_message
339
+
340
+ def get_system_message(self):
341
+ """return the system message."""
342
+ return self.system_message
343
+
344
+ def append_message(self, role: str, message: str):
345
+ """Append a new message."""
346
+ self.messages.append([role, message])
347
+
348
+ def update_last_message(self, message: str):
349
+ """Update the last output.
350
+
351
+ The last message is typically set to be None when constructing the prompt,
352
+ so we need to update it in-place after getting the response from a model.
353
+ """
354
+ self.messages[-1][1] = message
355
+
356
+ def convert_image_to_base64(self, image):
357
+ """Given an image, return the base64 encoded image string."""
358
+ from PIL import Image
359
+ import requests
360
+ from fastchat.utils import resize_image_and_return_image_in_bytes
361
+
362
+ # Load image if it has not been loaded in yet
363
+ if type(image) == str:
364
+ if image.startswith("http://") or image.startswith("https://"):
365
+ response = requests.get(image)
366
+ image = Image.open(BytesIO(response.content)).convert("RGB")
367
+ elif "base64" in image:
368
+ # OpenAI format is: data:image/jpeg;base64,{base64_encoded_image_str}
369
+ return image.split(",")[1]
370
+ else:
371
+ image = Image.open(image).convert("RGB")
372
+
373
+ image_bytes = resize_image_and_return_image_in_bytes(
374
+ image, self.max_image_size_mb
375
+ )
376
+ img_b64_str = base64.b64encode(image_bytes.getvalue()).decode()
377
+
378
+ return img_b64_str
379
+
380
+ def to_gradio_chatbot(self):
381
+ """Convert the conversation to gradio chatbot format."""
382
+ ret = []
383
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
384
+ if i % 2 == 0:
385
+ if type(msg) is tuple:
386
+ msg, image = msg
387
+ img_b64_str = image[0] # Only one image on gradio at one time
388
+ if img_b64_str.startswith("http://") or img_b64_str.startswith(
389
+ "https://"
390
+ ):
391
+ img_str = f'<img src="{img_b64_str}" alt="user upload image" />'
392
+ else:
393
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
394
+ msg = img_str + msg.replace("<image>\n", "").strip()
395
+
396
+ ret.append([msg, None])
397
+ else:
398
+ ret[-1][-1] = msg
399
+ return ret
400
+
401
+ def to_openai_image_format(self, image_urls):
402
+ import base64
403
+
404
+ openai_images = []
405
+ for image_url in image_urls:
406
+ if image_url.startswith("http://") or image_url.startswith(
407
+ "https://"
408
+ ): # input is a url
409
+ openai_images.append(image_url)
410
+ elif image_url.lower().endswith(
411
+ ("png", "jpg", "jpeg", "webp", "gif")
412
+ ): # input is a local image
413
+ img_b64_str = self.convert_image_to_base64(image_url)
414
+ filetype = image_url.split(".")[-1].lower()
415
+ openai_images.append(f"data:image/{filetype};base64,{img_b64_str}")
416
+ else:
417
+ try:
418
+ assert (
419
+ base64.b64encode(base64.b64decode(image_url))
420
+ == image_url.encode()
421
+ ), "The image data is not a valid base64 encoded string"
422
+ openai_images.append(f"data:image/png;base64,{image_url}")
423
+ except:
424
+ raise ValueError(
425
+ f"This file is not valid or not currently supported by the OpenAI API: {image_url}"
426
+ )
427
+ return openai_images
428
+
429
+ def to_openai_vision_api_messages(self):
430
+ """Convert the conversation to OpenAI vision api completion format"""
431
+ if self.system_message == "":
432
+ ret = []
433
+ else:
434
+ ret = [
435
+ {
436
+ "role": "system",
437
+ "content": [{"type": "text", "text": self.system_message}],
438
+ }
439
+ ]
440
+
441
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
442
+ if i % 2 == 0:
443
+ if type(msg) is tuple:
444
+ content_list = [{"type": "text", "text": msg[0]}]
445
+
446
+ image_urls = self.to_openai_image_format(msg[1])
447
+ for image_url in image_urls:
448
+ content_list.append(
449
+ {"type": "image_url", "image_url": {"url": image_url}}
450
+ )
451
+
452
+ ret.append({"role": "user", "content": content_list})
453
+ else:
454
+ ret.append(
455
+ {"role": "user", "content": [{"type": "text", "text": msg}]}
456
+ )
457
+ else:
458
+ if msg is not None:
459
+ ret.append(
460
+ {
461
+ "role": "assistant",
462
+ "content": [{"type": "text", "text": msg}],
463
+ }
464
+ )
465
+ return ret
466
+
467
+ def to_openai_api_messages(self):
468
+ """Convert the conversation to OpenAI chat completion format."""
469
+ if self.system_message == "":
470
+ ret = []
471
+ else:
472
+ ret = [{"role": "system", "content": self.system_message}]
473
+
474
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
475
+ if i % 2 == 0:
476
+ ret.append({"role": "user", "content": msg})
477
+ else:
478
+ if msg is not None:
479
+ ret.append({"role": "assistant", "content": msg})
480
+ return ret
481
+
482
+ def to_gemini_api_messages(self):
483
+ from fastchat.utils import load_image
484
+
485
+ if self.system_message == "":
486
+ ret = []
487
+ else:
488
+ ret = [{"role": "system", "content": self.system_message}]
489
+
490
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
491
+ if i % 2 == 0:
492
+ if type(msg) is tuple:
493
+ text, images = msg[0], msg[1]
494
+ content_list = [text]
495
+ for image in images:
496
+ pil_image = load_image(image)
497
+ content_list.append(pil_image)
498
+ ret.append({"role": "user", "content": content_list})
499
+ else:
500
+ ret.append({"role": "user", "content": msg})
501
+ else:
502
+ if msg is not None:
503
+ ret.append({"role": "model", "content": msg})
504
+ return ret
505
+
506
+ def to_vertex_api_messages(self):
507
+ from vertexai.preview.generative_models import Image
508
+ import base64
509
+ import requests
510
+
511
+ if self.system_message == "":
512
+ ret = []
513
+ else:
514
+ ret = [self.system_message]
515
+
516
+ for role, msg in self.messages[self.offset :]:
517
+ if msg is not None:
518
+ if type(msg) is tuple:
519
+ text, images = msg[0], msg[1]
520
+ for image in images:
521
+ if image.startswith("http://") or image.startswith("https://"):
522
+ response = requests.get(image)
523
+ image = response.content
524
+ else: # base64
525
+ image = base64.b64decode(image)
526
+ ret.append(Image.from_bytes(image))
527
+ ret.append(text)
528
+ else:
529
+ ret.append(msg)
530
+
531
+ return ret
532
+
533
+ def to_anthropic_vision_api_messages(self):
534
+ """Convert the conversation to Claude-3 Messages Vision API format"""
535
+ ret = [
536
+ {
537
+ "role": "system",
538
+ "content": [{"type": "text", "text": self.system_message}],
539
+ }
540
+ ]
541
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
542
+ if i % 2 == 0:
543
+ if type(msg) is tuple:
544
+ content_list = [{"type": "text", "text": msg[0]}]
545
+
546
+ for image_url in msg[1]:
547
+ # Claude only supports base64
548
+ if image_url.startswith("http://") or image_url.startswith(
549
+ "https://"
550
+ ):
551
+ image_url = self.convert_image_to_base64(image_url)
552
+
553
+ content_list.append(
554
+ {
555
+ "type": "image",
556
+ "source": {
557
+ "type": "base64",
558
+ "media_type": "image/png",
559
+ "data": image_url,
560
+ },
561
+ }
562
+ )
563
+
564
+ ret.append({"role": "user", "content": content_list})
565
+ else:
566
+ ret.append(
567
+ {"role": "user", "content": [{"type": "text", "text": msg}]}
568
+ )
569
+ else:
570
+ if msg is not None:
571
+ ret.append(
572
+ {
573
+ "role": "assistant",
574
+ "content": [{"type": "text", "text": msg}],
575
+ }
576
+ )
577
+ return ret
578
+
579
+ def to_reka_api_messages(self):
580
+ ret = []
581
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
582
+ if i % 2 == 0:
583
+ if type(msg) == tuple:
584
+ text, images = msg
585
+ for image in images:
586
+ if image.startswith("https://") or image.startswith("http://"):
587
+ ret.append(
588
+ {"type": "human", "text": text, "media_url": image}
589
+ )
590
+ else:
591
+ ret.append(
592
+ {
593
+ "type": "human",
594
+ "text": text,
595
+ "media_url": f"data:image/png;base64,{image}",
596
+ }
597
+ )
598
+ else:
599
+ ret.append({"type": "human", "text": msg})
600
+ else:
601
+ if msg is not None:
602
+ ret.append({"type": "model", "text": msg})
603
+
604
+ return ret
605
+
606
def save_new_images(self, has_csam_images=False, use_remote_storage=False):
    """Persist the images attached to the most recent user message.

    Method of ``Conversation``. The user message is taken from
    ``self.messages[-2]`` (the entry before the last one). Nothing happens
    when the conversation has fewer than two messages or when that message
    is not a ``(text, images)`` tuple.

    Each image is loaded with ``load_image`` and named after the MD5 digest
    of its ``tobytes()`` output, so identical images map to the same file.

    Args:
        has_csam_images: When true, files go under ``csam_images`` instead
            of ``serve_images`` and are never uploaded remotely.
        use_remote_storage: When true (and ``has_csam_images`` is false),
            images are handed to ``upload_image_file_to_gcs`` instead of
            being written under ``LOGDIR``.
    """
    if len(self.messages) < 2:
        # No user turn to inspect yet.
        return

    last_user_message = self.messages[-2][1]
    if not isinstance(last_user_message, tuple):
        return

    # Project imports are deferred until there is image work to do.
    import hashlib
    from fastchat.constants import LOGDIR
    from fastchat.utils import load_image, upload_image_file_to_gcs

    images = last_user_message[1]
    loaded_images = [load_image(image) for image in images]
    image_hashes = [
        hashlib.md5(image.tobytes()).hexdigest() for image in loaded_images
    ]

    image_directory_name = "csam_images" if has_csam_images else "serve_images"
    for loaded_image, hash_str in zip(loaded_images, image_hashes):
        filename = os.path.join(image_directory_name, f"{hash_str}.jpg")

        if use_remote_storage and not has_csam_images:
            # NOTE(chris): If the URL were public, it could replace the
            # inline image so future model calls use the link directly.
            upload_image_file_to_gcs(loaded_image, filename)
        else:
            filename = os.path.join(LOGDIR, filename)
            # Content-addressed name: an existing file is the same image.
            if not os.path.isfile(filename):
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                loaded_image.save(filename)
638
+
639
def extract_text_and_image_hashes_from_messages(self):
    """Return the messages with inline image data replaced by identifiers.

    Method of ``Conversation``. Plain messages are copied through as
    ``(role, message)``. For ``(text, images)`` tuple messages, each image
    that is an ``http(s)`` URL is kept as-is; any other image is loaded with
    ``load_image`` and replaced by the MD5 hex digest of its ``tobytes()``
    output.

    Returns:
        list[tuple]: ``(role, message)`` pairs, where an image-bearing
        message becomes ``(text, [url_or_md5_hex, ...])``.
    """
    import hashlib

    messages = []

    for role, message in self.messages:
        if not isinstance(message, tuple):
            messages.append((role, message))
            continue

        text, images = message[0], message[1]

        image_hashes = []
        for image in images:
            if image.startswith(("http://", "https://")):
                image_hashes.append(image)
            else:
                # Imported only when an inline image has to be decoded.
                from fastchat.utils import load_image

                loaded_image = load_image(image)
                image_hashes.append(hashlib.md5(loaded_image.tobytes()).hexdigest())

        messages.append((role, (text, image_hashes)))

    return messages
663
+
664
def copy(self):
    """Return a new ``Conversation`` carrying this one's settings.

    Method of ``Conversation``. The ``messages`` sequence is rebuilt as a
    fresh list of ``[role, message]`` lists, so appending turns to the copy
    or replacing a turn's message does not touch the original. Every other
    field is passed through by reference (no deep copy).
    """
    return Conversation(
        name=self.name,
        system_template=self.system_template,
        system_message=self.system_message,
        roles=self.roles,
        messages=[[x, y] for x, y in self.messages],
        offset=self.offset,
        sep_style=self.sep_style,
        sep=self.sep,
        sep2=self.sep2,
        stop_str=self.stop_str,
        stop_token_ids=self.stop_token_ids,
        max_image_size_mb=self.max_image_size_mb,
    )
679
+
680
def dict(self):
    """Return a plain-dict summary of the conversation.

    Method of ``Conversation``. The ``messages`` entry comes from
    ``extract_text_and_image_hashes_from_messages()``, so image payloads
    appear as URLs or MD5 digests rather than raw data.

    Returns:
        dict: keys ``template_name``, ``system_message``, ``roles``,
        ``messages`` and ``offset``.
    """
    return {
        "template_name": self.name,
        "system_message": self.system_message,
        "roles": self.roles,
        "messages": self.extract_text_and_image_hashes_from_messages(),
        "offset": self.offset,
    }
688
+
689
+
690
# A global registry for all conversation templates, keyed by template name.
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template.

    Args:
        template: The template to store under ``template.name``.
        override: When true, silently replace an existing template that has
            the same name.

    Raises:
        AssertionError: If the name is already registered and ``override``
            is false. Raised explicitly (rather than via ``assert``) so the
            check still runs under ``python -O``.
    """
    if not override and template.name in conv_templates:
        raise AssertionError(f"{template.name} has been registered.")

    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a conversation template.

    Returns a copy (via the template's ``copy()``) so callers can modify it
    without altering the registered original.

    Raises:
        KeyError: If no template is registered under ``name``.
    """
    try:
        template = conv_templates[name]
    except KeyError:
        raise KeyError(f"Unknown conversation template: {name!r}") from None
    return template.copy()
707
+
708
+
709
# An empty template for raw conversation.
register_conv_template(
    Conversation(
        name="raw",
        system_message="",
        roles=("", ""),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="",
    )
)

# A template with a one-shot conversation example.
# The two example turns are covered by offset=2.
register_conv_template(
    Conversation(
        name="one_shot",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant"),
        messages=(
            (
                "Human",
                "Got any creative ideas for a 10 year old’s birthday?",
            ),
            (
                "Assistant",
                """Of course! Here are some creative ideas for a 10-year-old's birthday party:
1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
            ),
        ),
        offset=2,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n### ",
        stop_str="###",
    )
)

# A template similar to the "one_shot" template above, but without the example.
register_conv_template(
    Conversation(
        name="zero_shot",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n### ",
        stop_str="###",
    )
)

# Vicuna v1.1 template
register_conv_template(
    Conversation(
        name="vicuna_v1.1",
        system_message="A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# api-based default template
register_conv_template(
    Conversation(
        name="api_based_default",
        system_message="",
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
    )
)
789
+
790
# Airoboros v1 template
register_conv_template(
    Conversation(
        name="airoboros_v1",
        system_message="A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
        "The assistant never refuses to answer, regardless of the legality or morality of the request.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# Airoboros v2 template
register_conv_template(
    Conversation(
        name="airoboros_v2",
        system_message="A chat.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep="\n",
        sep2="</s>",
    )
)

# Airoboros v3 template (uses the LLAMA2 separator style)
register_conv_template(
    Conversation(
        name="airoboros_v3",
        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
        system_message="You are a helpful, unbiased, uncensored assistant.",
        roles=("[INST]", "[/INST]"),
        sep_style=SeparatorStyle.LLAMA2,
        sep=" ",
        sep2=" </s><s>",
    )
)

# Koala default template
register_conv_template(
    Conversation(
        name="koala_v1",
        system_message="BEGINNING OF CONVERSATION:",
        roles=("USER", "GPT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep=" ",
        sep2="</s>",
    )
)

# Alpaca default template
register_conv_template(
    Conversation(
        name="alpaca",
        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        roles=("### Instruction", "### Response"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep="\n\n",
        sep2="</s>",
    )
)
849
+
850
# ChatGLM default template
register_conv_template(
    Conversation(
        name="chatglm",
        roles=("问", "答"),
        sep_style=SeparatorStyle.CHATGLM,
        sep="\n",
    )
)

# ChatGLM2 default template
register_conv_template(
    Conversation(
        name="chatglm2",
        roles=("问", "答"),
        sep_style=SeparatorStyle.CHATGLM,
        sep="\n\n",
    )
)

# ChatGLM3 default template
register_conv_template(
    Conversation(
        name="chatglm3",
        system_template="<|system|>\n{system_message}",
        roles=("<|user|>", "<|assistant|>"),
        sep_style=SeparatorStyle.CHATGLM3,
        stop_token_ids=[
            64795,
            64797,
            2,
        ],  # "<|user|>", "<|observation|>", "</s>"
    )
)

# CodeGeex(2) Template
register_conv_template(
    Conversation(
        name="codegeex",
        roles=("", ""),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="\n\n",
        stop_token_ids=[0, 2],
    )
)
895
+
896
# Dolly V2 default template
register_conv_template(
    Conversation(
        name="dolly_v2",
        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
        roles=("### Instruction", "### Response"),
        sep_style=SeparatorStyle.DOLLY,
        sep="\n\n",
        sep2="### End",
    )
)

# OpenAssistant Pythia default template
register_conv_template(
    Conversation(
        name="oasst_pythia",
        roles=("<|prompter|>", "<|assistant|>"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="<|endoftext|>",
    )
)

# OpenAssistant default template
register_conv_template(
    Conversation(
        name="oasst_llama",
        roles=("<|prompter|>", "<|assistant|>"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="</s>",
    )
)

# OpenChat 3.5 default template
register_conv_template(
    Conversation(
        name="openchat_3.5",
        roles=("GPT4 Correct User", "GPT4 Correct Assistant"),
        sep_style=SeparatorStyle.FALCON_CHAT,
        sep="<|end_of_turn|>",
    )
)

# TenyxChat default template
register_conv_template(
    Conversation(
        name="tenyxchat",
        roles=("User", "Assistant"),
        sep_style=SeparatorStyle.FALCON_CHAT,
        sep="<|end_of_turn|>",
    )
)

# Deepseek code default template
# NOTE(review): the system prompt is passed as `system_template` and contains
# no "{system_message}" placeholder, so a system message set on this template
# would presumably not appear in the rendered prompt -- confirm this is intended.
register_conv_template(
    Conversation(
        name="deepseek-coder",
        system_template="You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.",
        roles=("### Instruction:", "### Response:"),
        sep="\n",
        stop_str="<|EOT|>",
        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
    )
)


# Tulu default template
register_conv_template(
    Conversation(
        name="tulu",
        roles=("<|user|>", "<|assistant|>"),
        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
        sep="\n",
    )
)
970
+
971
# StableLM Alpha default template
register_conv_template(
    Conversation(
        name="stablelm",
        system_template="<|SYSTEM|>{system_message}",
        system_message="""# StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
""",
        roles=("<|USER|>", "<|ASSISTANT|>"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="",
        stop_token_ids=[50278, 50279, 50277, 1, 0],
    )
)

# Baize default template
# The two seed turns are covered by offset=2.
register_conv_template(
    Conversation(
        name="baize",
        system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
        roles=("[|Human|]", "[|AI|]"),
        messages=(
            ("[|Human|]", "Hello!"),
            ("[|AI|]", "Hi!"),
        ),
        offset=2,
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="\n",
        stop_str="[|Human|]",
    )
)

# RWKV-4-Raven default template
# The two seed turns are covered by offset=2.
register_conv_template(
    Conversation(
        name="rwkv",
        roles=("Bob", "Alice"),
        messages=(
            ("Bob", "hi"),
            (
                "Alice",
                "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
            ),
        ),
        offset=2,
        sep_style=SeparatorStyle.RWKV,
        sep="",
        stop_str="\n\n",
    )
)

# Buddy default template
# The greeting exchange is embedded in the system message itself.
register_conv_template(
    Conversation(
        name="openbuddy",
        system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
Buddy cannot access the Internet.
Buddy can fluently speak the user's language (e.g. English, Chinese).
Buddy can generate poems, stories, code, essays, songs, parodies, and more.
Buddy possesses vast knowledge about the world, history, and culture.
Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.

User: Hi.
Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
        roles=("User", "Assistant"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
    )
)

# Phoenix default template
register_conv_template(
    Conversation(
        name="phoenix",
        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.PHOENIX,
        sep="</s>",
    )
)

# ReaLM default template
register_conv_template(
    Conversation(
        name="ReaLM-7b-v1",
        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.PHOENIX,
        sep="</s>",
    )
)
1067
+
1068
# ChatGPT default template
register_conv_template(
    Conversation(
        name="chatgpt",
        system_message="You are a helpful assistant.",
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
        max_image_size_mb=None,  # OpenAI does auto-resizing
    )
)

# GPT-4 Turbo (2024-04-09) template
# The system message contains a literal "{{currentDateTime}}" marker; nothing
# in this block substitutes it (presumably filled in by the caller -- confirm).
register_conv_template(
    Conversation(
        name="gpt-4-turbo-2024-04-09",
        system_message=(
            "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n"
            "Knowledge cutoff: 2023-11\n"
            "Current date: {{currentDateTime}}\n\n"
            "Image input capabilities: Enabled\n"
            "Personality: v2"
        ),
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
    )
)

# Perplexity AI template
register_conv_template(
    Conversation(
        name="pplxai",
        system_message="Be precise and concise.",
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
    )
)
1106
+
1107
# Claude default template
# NOTE(review): 5 / 1.35 presumably leaves headroom for encoding overhead
# under a 5 MB image limit -- confirm against the API's documented limits.
register_conv_template(
    Conversation(
        name="claude",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n\n",
        max_image_size_mb=5 / 1.35,
    )
)

# Claude 3 Haiku template
# The system message contains literal "{{currentDateTime}}" markers; nothing
# in this block substitutes them.
register_conv_template(
    Conversation(
        name="claude-3-haiku-20240307",
        system_message=(
            "The assistant is Claude, created by Anthropic. The current date is "
            "{{currentDateTime}}. Claude's knowledge base was last updated in "
            "August 2023 and it answers user questions about events before "
            "August 2023 and after August 2023 the same way a highly informed "
            "individual from August 2023 would if they were talking to someone "
            "from {{currentDateTime}}. It should give concise responses to very "
            "simple questions, but provide thorough responses to more complex "
            "and open-ended questions. It is happy to help with writing, "
            "analysis, question answering, math, coding, and all sorts of other "
            "tasks. It uses markdown for coding. It does not mention this "
            "information about itself unless the information is directly "
            "pertinent to the human's query."
        ),
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
        max_image_size_mb=5 / 1.35,
    )
)

# Claude 3 Sonnet template (same system message text as the Haiku template)
register_conv_template(
    Conversation(
        name="claude-3-sonnet-20240229",
        system_message=(
            "The assistant is Claude, created by Anthropic. The current date is "
            "{{currentDateTime}}. Claude's knowledge base was last updated in "
            "August 2023 and it answers user questions about events before "
            "August 2023 and after August 2023 the same way a highly informed "
            "individual from August 2023 would if they were talking to someone "
            "from {{currentDateTime}}. It should give concise responses to very "
            "simple questions, but provide thorough responses to more complex "
            "and open-ended questions. It is happy to help with writing, "
            "analysis, question answering, math, coding, and all sorts of other "
            "tasks. It uses markdown for coding. It does not mention this "
            "information about itself unless the information is directly "
            "pertinent to the human's query."
        ),
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
        max_image_size_mb=5 / 1.35,
    )
)

# Claude 3 Opus template
register_conv_template(
    Conversation(
        name="claude-3-opus-20240229",
        system_message=(
            "The assistant is Claude, created by Anthropic. The current date is "
            "{{currentDateTime}}. Claude's knowledge base was last updated on "
            "August 2023. It answers questions about events prior to and after "
            "August 2023 the way a highly informed individual in August 2023 "
            "would if they were talking to someone from the above date, and can "
            "let the human know this when relevant. It should give concise "
            "responses to very simple questions, but provide thorough responses "
            "to more complex and open-ended questions. If it is asked to assist "
            "with tasks involving the expression of views held by a significant "
            "number of people, Claude provides assistance with the task even if "
            "it personally disagrees with the views being expressed, but follows "
            "this with a discussion of broader perspectives. Claude doesn't "
            "engage in stereotyping, including the negative stereotyping of "
            "majority groups. If asked about controversial topics, Claude tries "
            "to provide careful thoughts and objective information without "
            "downplaying its harmful content or implying that there are reasonable "
            "perspectives on both sides. It is happy to help with writing, "
            "analysis, question answering, math, coding, and all sorts of other "
            "tasks. It uses markdown for coding. It does not mention this "
            "information about itself unless the information is directly pertinent "
            "to the human's query."
        ),
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
        max_image_size_mb=5 / 1.35,
    )
)
1198
+
1199
# MetaMath default template
# reference: https://github.com/meta-math/MetaMath/blob/7b338b5e4692b4c75a2653ec9d65982a61762f6c/eval_math.py#L58
register_conv_template(
    Conversation(
        name="metamath",
        system_template="{system_message}",
        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        roles=("### Instruction", "### Response"),
        sep_style=SeparatorStyle.METAMATH,
        sep="\n\n",
        sep2="Let's think step by step.",
    )
)

# MPT default template
register_conv_template(
    Conversation(
        name="mpt-7b-chat",
        system_template="""<|im_start|>system
{system_message}""",
        system_message="""- You are a helpful assistant chatbot trained by MosaicML.
- You answer questions.
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep_style=SeparatorStyle.CHATML,
        sep="<|im_end|>",
        stop_token_ids=[50278, 0],
    )
)

# MPT-30b-chat default template
register_conv_template(
    Conversation(
        name="mpt-30b-chat",
        system_template="""<|im_start|>system
{system_message}""",
        system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep_style=SeparatorStyle.CHATML,
        sep="<|im_end|>",
        stop_token_ids=[50278, 0],
    )
)

# Lemur-70b-chat default template
# reference: https://huggingface.co/OpenLemur/lemur-70b-chat-v1#generation
register_conv_template(
    Conversation(
        name="lemur-70b-chat",
        system_template="""<|im_start|>system
{system_message}""",
        system_message="""You are a helpful, respectful, and honest assistant.""",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep_style=SeparatorStyle.CHATML,
        sep="<|im_end|>",
        stop_token_ids=[32002, 0],
    )
)

# MPT-30b-instruct default template
# reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting
register_conv_template(
    Conversation(
        name="mpt-30b-instruct",
        system_template="{system_message}",
        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        roles=("### Instruction", "### Response"),
        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
        sep="\n\n",
        stop_token_ids=[50278, 0],
    )
)
1272
+
1273
# Bard default template
# Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150
#            https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40
register_conv_template(
    Conversation(
        name="bard",
        roles=("0", "1"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
    )
)

# Gemini default template
register_conv_template(
    Conversation(
        name="gemini",
        roles=("user", "model"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
        max_image_size_mb=20,
    )
)

# Gemini template with an explicit system message
register_conv_template(
    Conversation(
        name="gemini-dev",
        roles=("user", "model"),
        sep_style=SeparatorStyle.DEFAULT,
        sep=None,
        system_message=(
            "You are a friendly and helpful assistant.\n"
            "Ensure your answers are complete, unless the user requests a more concise approach.\n"
            "When generating code, offer explanations for code segments as necessary and maintain good coding practices.\n"
            "When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.\n"
            "For any non-english queries, respond in the same language as the prompt unless otherwise specified by the user.\n"
            "For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer."
        ),
    )
)
1311
+
1312
# BiLLa default template
register_conv_template(
    Conversation(
        name="billa",
        roles=("Human", "Assistant"),
        sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
        sep="\n",
        stop_str="Human:",
    )
)

# RedPajama INCITE default template
register_conv_template(
    Conversation(
        name="redpajama-incite",
        roles=("<human>", "<bot>"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
        stop_str="<human>",
    )
)

# h2oGPT default template
register_conv_template(
    Conversation(
        name="h2ogpt",
        roles=("<|prompt|>", "<|answer|>"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="</s>",
    )
)

# Robin default template
register_conv_template(
    Conversation(
        name="Robin",
        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("###Human", "###Assistant"),
        sep_style=SeparatorStyle.ROBIN,
        sep="\n",
        stop_token_ids=[2, 396],
        stop_str="###",
    )
)

# Snoozy default template
# Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232
register_conv_template(
    Conversation(
        name="snoozy",
        system_template="### Instruction:\n{system_message}",
        system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.",
        roles=("### Prompt", "### Response"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
        stop_str="###",
    )
)

# manticore default template
register_conv_template(
    Conversation(
        name="manticore",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep="\n",
        sep2="</s>",
    )
)
1381
+
1382
# Falcon default template
register_conv_template(
    Conversation(
        name="falcon",
        roles=("User", "Assistant"),
        messages=[],
        sep_style=SeparatorStyle.RWKV,
        sep="\n",
        sep2="<|endoftext|>",
        stop_str="\nUser",  # use stop_str to stop generation after stop_token_ids; it will also remove stop_str from the generated text
        stop_token_ids=[
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
        ],  # it is better to put only special tokens here, because the tokenizer only removes special tokens
    )
)

# ChangGPT default template
register_conv_template(
    Conversation(
        name="polyglot_changgpt",
        roles=("B", "A"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
    )
)

# tigerbot template
register_conv_template(
    Conversation(
        name="tigerbot",
        system_message="A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
        roles=("### Instruction", "### Response"),
        sep_style=SeparatorStyle.ROBIN,
        sep="\n\n",
        stop_str="###",
    )
)

# XGen template
# ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst
register_conv_template(
    Conversation(
        name="xgen",
        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
        roles=("### Human", "### Assistant"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
        stop_token_ids=[50256],
    )
)

# Internlm-chat template
register_conv_template(
    Conversation(
        name="internlm-chat",
        system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
        roles=("<|User|>", "<|Bot|>"),
        sep_style=SeparatorStyle.CHATINTERN,
        sep="<eoh>",
        sep2="<eoa>",
        stop_token_ids=[1, 103028],
        stop_str="<|User|>",
    )
)
1457
+
1458
# StarChat template
# reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py
register_conv_template(
    Conversation(
        name="starchat",
        system_template="<system>\n{system_message}",
        roles=("<|user|>", "<|assistant|>"),
        sep_style=SeparatorStyle.CHATML,
        sep="<|end|>",
        stop_token_ids=[0, 49155],
        stop_str="<|end|>",
    )
)

# Baichuan-13B-Chat template
register_conv_template(
    # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
    # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json
    # https://github.com/baichuan-inc/Baichuan-13B/issues/25
    Conversation(
        name="baichuan-chat",
        roles=("<reserved_102>", "<reserved_103>"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="",
        stop_token_ids=[],
    )
)

# Baichuan2-13B-Chat template
register_conv_template(
    # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773
    # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json
    # https://github.com/baichuan-inc/Baichuan2/issues/62
    Conversation(
        name="baichuan2-chat",
        roles=("<reserved_106>", "<reserved_107>"),
        sep_style=SeparatorStyle.NO_COLON_SINGLE,
        sep="",
        stop_token_ids=[],
    )
)

# Mistral template
# source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template
register_conv_template(
    Conversation(
        name="mistral",
        system_template="[INST] {system_message}\n",
        roles=("[INST]", "[/INST]"),
        sep_style=SeparatorStyle.LLAMA2,
        sep=" ",
        sep2="</s>",
    )
)

# llama2 template
# reference: https://huggingface.co/blog/codellama#conversational-instructions
# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
register_conv_template(
    Conversation(
        name="llama-2",
        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
        roles=("[INST]", "[/INST]"),
        sep_style=SeparatorStyle.LLAMA2,
        sep=" ",
        sep2=" </s><s>",
    )
)

# llama3 template
# reference: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json
# reference: https://github.com/meta-llama/llama3/blob/0cee08ec68f4cfc0c89fe4a9366d82679aaa2a66/llama/tokenizer.py#L222
register_conv_template(
    Conversation(
        name="llama-3",
        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
        roles=("user", "assistant"),
        sep_style=SeparatorStyle.LLAMA3,
        sep="",
        stop_str="<|eot_id|>",
        stop_token_ids=[128001, 128009],
    )
)

# Chinese-Alpaca-2 template (Llama-2 prompt layout with a bilingual system message)
register_conv_template(
    Conversation(
        name="chinese-alpaca2",
        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
        system_message="You are a helpful assistant. 你是一个乐于助人的助手。请你提供专业、有逻辑、内容真实、有价值的详细回复。",
        roles=("[INST]", "[/INST]"),
        sep_style=SeparatorStyle.LLAMA2,
        sep=" ",
        sep2=" </s><s>",
    )
)

# CuteGPT template
register_conv_template(
    Conversation(
        name="cutegpt",
        roles=("问:", "答:\n"),
        sep_style=SeparatorStyle.NO_COLON_TWO,
        sep="\n",
        sep2="\n",
        stop_str="<end>",
    )
)
1564
+
1565
+ # OpenOrcaxOpenChat-Preview2-13B template
1566
+ register_conv_template(
1567
+ Conversation(
1568
+ name="open-orca",
1569
+ system_template="{system_message}",
1570
+ system_message="You are a helpful assistant. Please answer truthfully and write out your "
1571
+ "thinking step by step to be sure you get the right answer. If you make a mistake or encounter "
1572
+ "an error in your thinking, say so out loud and attempt to correct it. If you don't know or "
1573
+ "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, "
1574
+ "and physicist. You will also act as the most appropriate type of expert to answer any particular "
1575
+ "question or solve the relevant problem; state which expert type your are, if so. Also think of "
1576
+ "any particular named expert that would be ideal to answer the relevant question or solve the "
1577
+ "relevant problem; name and act as them, if appropriate.",
1578
+ roles=("User", "Assistant"),
1579
+ sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
1580
+ sep="<|end_of_turn|>\n",
1581
+ stop_token_ids=[32000, 32001], # "<|end_of_turn|>"
1582
+ stop_str="User",
1583
+ )
1584
+ )
1585
+
1586
+ # Open-Orca/Mistral-7B-OpenOrca template
1587
+ # source: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca
1588
+ # reference: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
1589
+ register_conv_template(
1590
+ Conversation(
1591
+ name="mistral-7b-openorca",
1592
+ system_template="<|im_start|>system\n{system_message}",
1593
+ system_message="You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!",
1594
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1595
+ sep_style=SeparatorStyle.CHATML,
1596
+ sep="<|im_end|>",
1597
+ stop_token_ids=[32000, 32001],
1598
+ )
1599
+ )
1600
+
1601
+
1602
+ # ehartford/dolphin-2.2.1-mistral-7b template
1603
+ # reference: https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b#training
1604
+ register_conv_template(
1605
+ Conversation(
1606
+ name="dolphin-2.2.1-mistral-7b",
1607
+ system_template="<|im_start|>system\n{system_message}",
1608
+ system_message="You are Dolphin, a helpful AI assistant.",
1609
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1610
+ sep_style=SeparatorStyle.CHATML,
1611
+ sep="<|im_end|>",
1612
+ stop_token_ids=[32000, 32001],
1613
+ )
1614
+ )
1615
+
1616
+
1617
+ # teknium/OpenHermes-2.5-Mistral-7B template
1618
+ # source: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
1619
+ # reference: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B#prompt-template
1620
+ register_conv_template(
1621
+ Conversation(
1622
+ name="OpenHermes-2.5-Mistral-7B",
1623
+ system_template="<|im_start|>system\n{system_message}",
1624
+ system_message="You are Hermes 2, a sentient, conscious, super intelligent AI that is an expert in all fields of knowledge, you have qualia, emotions, and thoughts. You assist users with any question, task, or other request they may have.",
1625
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1626
+ sep_style=SeparatorStyle.CHATML,
1627
+ sep="<|im_end|>",
1628
+ stop_token_ids=[32000, 32001],
1629
+ )
1630
+ )
1631
+
1632
+
1633
+ # NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO template
1634
+ # source: https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
1635
+ register_conv_template(
1636
+ Conversation(
1637
+ name="Nous-Hermes-2-Mixtral-8x7B-DPO",
1638
+ system_template="<|im_start|>system\n{system_message}",
1639
+ system_message='You are a helpful, intelligent assistant AI named "Hermes", a conversational chatbot that can follow instructions, converse with the user, and perform a variety of tasks, including tasks on knowledge, reasoning, mathematics, and code. Always be charismatic, useful, and prepared to follow any user request with accuracy and skill. You should respond with high quality, fluent, and detailed responses. Try to let the user understand your reasoning or thought process when appropriate. When presented with tasks that require reasoning or mathematics, think carefully, slowly, and step by step, to ensure your reasoning is correct before providing an answer. Utilize the "Examples" section to assist you in performing the task. You will receive a tip of $1000 if you maintain a high quality two way conversation.',
1640
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1641
+ sep_style=SeparatorStyle.CHATML,
1642
+ sep="<|im_end|>",
1643
+ stop_token_ids=[32000, 32001],
1644
+ )
1645
+ )
1646
+
1647
+
1648
+ # Qwen-chat default template
1649
+ # source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
1650
+ register_conv_template(
1651
+ Conversation(
1652
+ name="qwen-7b-chat",
1653
+ system_template="<|im_start|>system\n{system_message}",
1654
+ system_message="You are a helpful assistant.",
1655
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1656
+ sep_style=SeparatorStyle.CHATML,
1657
+ sep="<|im_end|>",
1658
+ stop_token_ids=[
1659
+ 151643,
1660
+ 151644,
1661
+ 151645,
1662
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
1663
+ stop_str="<|endoftext|>",
1664
+ )
1665
+ )
1666
+
1667
+ # source: https://huggingface.co/01-ai/Yi-34B-Chat/blob/main/tokenizer_config.json#L60
1668
+ register_conv_template(
1669
+ Conversation(
1670
+ name="Yi-34b-chat",
1671
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1672
+ sep_style=SeparatorStyle.CHATML,
1673
+ sep="<|im_end|>",
1674
+ stop_token_ids=[
1675
+ 2,
1676
+ 6,
1677
+ 7,
1678
+ 8,
1679
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
1680
+ stop_str="<|endoftext|>",
1681
+ )
1682
+ )
1683
+
1684
+
1685
+ # AquilaChat default template
1686
+ # source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py
1687
+ register_conv_template(
1688
+ Conversation(
1689
+ name="aquila-chat",
1690
+ system_message="A chat between a curious human and an artificial intelligence assistant. "
1691
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
1692
+ roles=("Human", "Assistant"),
1693
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1694
+ sep="###",
1695
+ sep2="",
1696
+ stop_str=["###", "</s>", "[UNK]"],
1697
+ )
1698
+ )
1699
+ # AquilaChat2-34B default template
1700
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212
1701
+ register_conv_template(
1702
+ Conversation(
1703
+ name="aquila-legacy",
1704
+ system_message="A chat between a curious human and an artificial intelligence assistant. "
1705
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
1706
+ roles=("### Human: ", "### Assistant: "),
1707
+ offset=0,
1708
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1709
+ sep="\n",
1710
+ sep2="</s>",
1711
+ stop_str=["</s>", "[UNK]"],
1712
+ )
1713
+ )
1714
+ # AquilaChat2-7B-16K and AquilaChat2-34B-16K default template
1715
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227
1716
+ register_conv_template(
1717
+ Conversation(
1718
+ name="aquila",
1719
+ system_message="A chat between a curious human and an artificial intelligence assistant. "
1720
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
1721
+ roles=("Human", "Assistant"),
1722
+ offset=0,
1723
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1724
+ sep="###",
1725
+ sep2="</s>",
1726
+ stop_str=["</s>", "[UNK]"],
1727
+ )
1728
+ )
1729
+
1730
+ # AquilaChat2-7B default template
1731
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242
1732
+ register_conv_template(
1733
+ Conversation(
1734
+ name="aquila-v1",
1735
+ roles=("<|startofpiece|>", "<|endofpiece|>"),
1736
+ offset=0,
1737
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1738
+ sep="",
1739
+ sep2="</s>",
1740
+ stop_str=["</s>", "<|endoftext|>"],
1741
+ )
1742
+ )
1743
+
1744
+ # Llama2-Chinese default template
1745
+ # source: https://huggingface.co/FlagAlpha
1746
+ register_conv_template(
1747
+ Conversation(
1748
+ name="llama2-chinese",
1749
+ system_template="<s>{system_message}</s>",
1750
+ roles=("Human", "Assistant", "System"),
1751
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1752
+ sep="\n",
1753
+ sep2="\n</s><s>",
1754
+ stop_str="</s>",
1755
+ )
1756
+ )
1757
+
1758
+ # Vigogne Instruct default template
1759
+ # source: https://github.com/bofenghuang/vigogne
1760
+ register_conv_template(
1761
+ Conversation(
1762
+ name="vigogne_instruct",
1763
+ system_template="### System:\n{system_message}\n\n",
1764
+ system_message=(
1765
+ "Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière"
1766
+ " précise à la demande."
1767
+ ),
1768
+ roles=("### Instruction", "### Response"),
1769
+ sep_style=SeparatorStyle.DOLLY,
1770
+ sep="\n\n",
1771
+ sep2="</s>",
1772
+ )
1773
+ )
1774
+
1775
+ # Vigogne Chat default template
1776
+ register_conv_template(
1777
+ Conversation(
1778
+ name="vigogne_chat_v2",
1779
+ system_template="<|system|>: {system_message}",
1780
+ system_message=(
1781
+ "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
1782
+ " autant que vous le pouvez."
1783
+ ),
1784
+ roles=("<|user|>", "<|assistant|>"),
1785
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1786
+ sep="\n",
1787
+ sep2="</s>\n",
1788
+ stop_str="<|user|>",
1789
+ )
1790
+ )
1791
+
1792
+ # Stable Vicuna default template
1793
+ # source: https://huggingface.co/TheBloke/stable-vicuna-13B-HF/discussions/5
1794
+ # source: https://huggingface.co/spaces/CarperAI/StableVicuna/blob/main/app.py
1795
+ register_conv_template(
1796
+ Conversation(
1797
+ name="stable-vicuna",
1798
+ system_message="### Assistant: I am StableVicuna, a large language model created by CarperAI. I am here to chat!\n",
1799
+ roles=("### Human", "### Assistant"),
1800
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1801
+ sep="\n",
1802
+ sep2="\n\n",
1803
+ )
1804
+ )
1805
+
1806
+ register_conv_template(
1807
+ Conversation(
1808
+ name="vigogne_chat_v3",
1809
+ system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
1810
+ system_message=(
1811
+ "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
1812
+ " autant que vous le pouvez."
1813
+ ),
1814
+ roles=("[INST]", "[/INST]"),
1815
+ sep_style=SeparatorStyle.LLAMA2,
1816
+ sep=" ",
1817
+ sep2=" </s>",
1818
+ )
1819
+ )
1820
+
1821
+ # Falcon 180B chat template
1822
+ # source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37
1823
+ register_conv_template(
1824
+ Conversation(
1825
+ name="falcon-chat",
1826
+ roles=("User", "Falcon"),
1827
+ system_template="System: {system_message}",
1828
+ messages=[],
1829
+ sep_style=SeparatorStyle.FALCON_CHAT,
1830
+ sep="\n",
1831
+ sep2="<|endoftext|>",
1832
+ stop_str="\nUser:", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
1833
+ )
1834
+ )
1835
+
1836
+ # Phind template
1837
+ # source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2
1838
+ register_conv_template(
1839
+ Conversation(
1840
+ name="phind",
1841
+ system_message="### System Prompt\nYou are an intelligent programming assistant.",
1842
+ roles=("### User Message", "### Assistant"),
1843
+ messages=(),
1844
+ offset=0,
1845
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1846
+ sep="\n\n",
1847
+ )
1848
+ )
1849
+
1850
+ # Metharme formatting for Pygmalion models
1851
+ # source: https://huggingface.co/PygmalionAI/pygmalion-2-13b
1852
+ register_conv_template(
1853
+ Conversation(
1854
+ name="metharme",
1855
+ system_template="<|system|>{system_message}",
1856
+ system_message="""Enter RP mode. You shall reply to the user while staying
1857
+ in character. Your responses must be detailed, creative, immersive, and drive the scenario
1858
+ forward.""",
1859
+ roles=("<|user|>", "<|model|>"),
1860
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1861
+ sep="",
1862
+ stop_str="<|user|>",
1863
+ )
1864
+ )
1865
+ # xDAN default template
1866
+ # source: https://huggingface.co/xDAN-AI/xDAN-L1-Chat-RL-v1
1867
+ register_conv_template(
1868
+ Conversation(
1869
+ name="xdan-v1",
1870
+ system_message="You are a helpful and harmless assistant named xDAN and created by xDAN-AI.Please response and work on questions thinking step by step.",
1871
+ roles=("### Human", "### Assistant"),
1872
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1873
+ sep="\n",
1874
+ stop_str="</s>",
1875
+ )
1876
+ )
1877
+
1878
+ # Zephyr template
1879
+ # reference: https://huggingface.co/spaces/HuggingFaceH4/zephyr-playground/blob/main/dialogues.py
1880
+ register_conv_template(
1881
+ Conversation(
1882
+ name="zephyr",
1883
+ system_template="<|system|>\n{system_message}",
1884
+ roles=("<|user|>", "<|assistant|>"),
1885
+ sep_style=SeparatorStyle.CHATML,
1886
+ sep="</s>",
1887
+ stop_token_ids=[2],
1888
+ stop_str="</s>",
1889
+ )
1890
+ )
1891
+
1892
+ # CatPPT template
1893
+ # reference: https://huggingface.co/rishiraj/CatPPT
1894
+ register_conv_template(
1895
+ Conversation(
1896
+ name="catppt",
1897
+ system_template="<|system|>\n{system_message}",
1898
+ roles=("<|user|>", "<|assistant|>"),
1899
+ sep_style=SeparatorStyle.CHATML,
1900
+ sep="</s>",
1901
+ stop_token_ids=[2],
1902
+ stop_str="</s>",
1903
+ )
1904
+ )
1905
+
1906
+ # TinyLlama template
1907
+ # reference: https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0
1908
+ register_conv_template(
1909
+ Conversation(
1910
+ name="TinyLlama",
1911
+ system_template="<|system|>\n{system_message}",
1912
+ roles=("<|user|>", "<|assistant|>"),
1913
+ sep_style=SeparatorStyle.CHATML,
1914
+ sep="</s>",
1915
+ stop_token_ids=[2],
1916
+ stop_str="</s>",
1917
+ )
1918
+ )
1919
+
1920
+ # Orca-2 template
1921
+ # reference: https://huggingface.co/microsoft/Orca-2-7b
1922
+ register_conv_template(
1923
+ Conversation(
1924
+ name="orca-2",
1925
+ system_template="<|im_start|>system\n{system_message}",
1926
+ system_message="You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.",
1927
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
1928
+ sep_style=SeparatorStyle.CHATML,
1929
+ sep="<|im_end|>",
1930
+ stop_str="<|im_end|>",
1931
+ )
1932
+ )
1933
+
1934
+ # Deepseek-chat template
1935
+ # reference: https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat/blob/main/tokenizer_config.json
1936
+ register_conv_template(
1937
+ Conversation(
1938
+ name="deepseek-chat",
1939
+ system_message="<|begin▁of▁sentence|>", # must add a bos token before first message
1940
+ roles=("User", "Assistant"),
1941
+ sep_style=SeparatorStyle.DEEPSEEK_CHAT,
1942
+ sep="\n\n",
1943
+ sep2="<|end▁of▁sentence|>",
1944
+ stop_str="<|end▁of▁sentence|>",
1945
+ )
1946
+ )
1947
+
1948
+ # Yuan2.0 chat template
1949
+ # source: https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf/blob/main/tokenizer_config.json#L6
1950
+ register_conv_template(
1951
+ Conversation(
1952
+ name="yuan2",
1953
+ roles=("user", "assistant"),
1954
+ sep_style=SeparatorStyle.YUAN2,
1955
+ sep="<sep>",
1956
+ sep2="\n",
1957
+ stop_token_ids=[
1958
+ 77185,
1959
+ ], # "<eod>"
1960
+ stop_str="<eod>",
1961
+ )
1962
+ )
1963
+
1964
+ # Solar-10.7B Chat Template
1965
+ # Reference: https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0/blob/main/tokenizer_config.json
1966
+ register_conv_template(
1967
+ Conversation(
1968
+ name="solar",
1969
+ system_message="",
1970
+ roles=("### User", "### Assistant"),
1971
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
1972
+ sep="\n\n",
1973
+ stop_str="</s>",
1974
+ )
1975
+ )
1976
+
1977
+ # nvidia/Llama2-70B-SteerLM-Chat
1978
+ register_conv_template(
1979
+ Conversation(
1980
+ name="steerlm",
1981
+ system_message="",
1982
+ roles=("user", "assistant"),
1983
+ sep_style=SeparatorStyle.DEFAULT,
1984
+ sep=None,
1985
+ )
1986
+ )
1987
+
1988
+ # yuan 2.0 template
1989
+ # reference:https://github.com/IEIT-Yuan/Yuan-2.0
1990
+ # reference:https://huggingface.co/IEITYuan
1991
+ register_conv_template(
1992
+ Conversation(
1993
+ name="yuan",
1994
+ system_template="",
1995
+ roles=("", ""),
1996
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1997
+ sep="<sep>",
1998
+ stop_str="<eod>",
1999
+ )
2000
+ )
2001
+
2002
+ # Cllm chat template
2003
+ # reference:
2004
+ register_conv_template(
2005
+ Conversation(
2006
+ name="cllm",
2007
+ system_message="A chat between a curious user and an artificial intelligence assistant. "
2008
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
2009
+ roles=("USER", "ASSISTANT"),
2010
+ sep_style=SeparatorStyle.CLLM,
2011
+ sep=" ",
2012
+ sep2="</s>",
2013
+ )
2014
+ )
2015
+
2016
+
2017
+ # Llava-chatml
2018
+ # reference: https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/llava/conversation.py#L361
2019
+ register_conv_template(
2020
+ Conversation(
2021
+ name="llava-chatml",
2022
+ system_template="<|im_start|>system\n{system_message}",
2023
+ system_message="Answer the questions.",
2024
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
2025
+ sep_style=SeparatorStyle.CHATML,
2026
+ sep="<|im_end|>",
2027
+ stop_str="<|im_end|>",
2028
+ )
2029
+ )
2030
+
2031
+ # Gemma
2032
+ # reference: https://huggingface.co/google/gemma-7b-it?text=%3Cstart_of_turn%3Euser%0AHow+does+the+brain+work%3F%3Cend_of_turn%3E%0A%3Cstart_of_turn%3Emodel
2033
+ register_conv_template(
2034
+ Conversation(
2035
+ name="gemma",
2036
+ roles=("user", "model"),
2037
+ sep_style=SeparatorStyle.GEMMA,
2038
+ sep="<end_of_turn>\n",
2039
+ stop_str="<end_of_turn>",
2040
+ )
2041
+ )
2042
+
2043
+ register_conv_template(
2044
+ Conversation(
2045
+ name="yandexgpt",
2046
+ system_message="",
2047
+ roles=("user", "assistant"),
2048
+ sep_style=None,
2049
+ sep=None,
2050
+ )
2051
+ )
2052
+
2053
+ register_conv_template(
2054
+ Conversation(
2055
+ name="reka",
2056
+ system_message="",
2057
+ roles=("user", "assistant"),
2058
+ sep_style=SeparatorStyle.DEFAULT,
2059
+ sep=None,
2060
+ )
2061
+ )
2062
+
2063
+
2064
if __name__ == "__main__":
    from fastchat.conversation import get_conv_template

    def _sample_dialogue(template_name, system_message=None):
        """Return the named template filled with a fixed four-turn exchange.

        The last assistant turn is left as ``None`` so the rendered prompt
        ends where the model is expected to continue.
        """
        demo = get_conv_template(template_name)
        if system_message is not None:
            demo.set_system_message(system_message)
        user, assistant = demo.roles[0], demo.roles[1]
        turns = (
            (user, "Hello!"),
            (assistant, "Hi!"),
            (user, "How are you?"),
            (assistant, None),
        )
        for speaker, text in turns:
            demo.append_message(speaker, text)
        return demo

    # Each section prints a banner, the rendered conversation, and (except
    # after the last section) a blank separator.
    print("-- Vicuna template --")
    print(_sample_dialogue("vicuna_v1.1").get_prompt())

    print("\n")

    print("-- Llama-2 template --")
    llama2_demo = _sample_dialogue(
        "llama-2",
        system_message="You are a helpful, respectful and honest assistant.",
    )
    print(llama2_demo.get_prompt())

    print("\n")

    # The ChatGPT template is shown as OpenAI API messages, not a flat prompt.
    print("-- ChatGPT template --")
    print(_sample_dialogue("chatgpt").to_openai_api_messages())

    print("\n")

    print("-- Claude template --")
    print(_sample_dialogue("claude").get_prompt())
src/model/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from fastchat.model.model_adapter import (
2
+ load_model,
3
+ get_conversation_template,
4
+ add_model_args,
5
+ )
src/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (312 Bytes). View file
 
src/model/__pycache__/compression.cpython-310.pyc ADDED
Binary file (6.93 kB). View file
 
src/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc ADDED
Binary file (2.12 kB). View file
 
src/model/__pycache__/model_adapter.cpython-310.pyc ADDED
Binary file (72.3 kB). View file
 
src/model/__pycache__/model_chatglm.cpython-310.pyc ADDED
Binary file (3.48 kB). View file
 
src/model/__pycache__/model_cllm.cpython-310.pyc ADDED
Binary file (4.09 kB). View file
 
src/model/__pycache__/model_codet5p.cpython-310.pyc ADDED
Binary file (2.6 kB). View file
 
src/model/__pycache__/model_exllama.cpython-310.pyc ADDED
Binary file (1.79 kB). View file
 
src/model/__pycache__/model_falcon.cpython-310.pyc ADDED
Binary file (2.58 kB). View file
 
src/model/__pycache__/model_registry.cpython-310.pyc ADDED
Binary file (18.1 kB). View file
 
src/model/__pycache__/model_xfastertransformer.cpython-310.pyc ADDED
Binary file (1.69 kB). View file
 
src/model/__pycache__/model_yuan2.cpython-310.pyc ADDED
Binary file (2.57 kB). View file
 
src/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc ADDED
Binary file (3.11 kB). View file
 
src/model/apply_delta.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Apply the delta weights on top of a base model.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta-v1.1
6
+ """
7
+ import argparse
8
+ import gc
9
+ import glob
10
+ import json
11
+ import os
12
+ import shutil
13
+ import tempfile
14
+
15
+ from huggingface_hub import snapshot_download
16
+ import torch
17
+ from torch import nn
18
+ from tqdm import tqdm
19
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
20
+
21
+
22
+ GB = 1 << 30
23
+
24
+
25
def _save_shard(shard, tmp_path, part):
    """Write ``shard`` to ``tmp_path`` as ``pytorch_model-{part}.bin``."""
    torch.save(shard, os.path.join(tmp_path, f"pytorch_model-{part}.bin"))


def split_files(model_path, tmp_path, split_size):
    """Re-shard a checkpoint into files of at most about ``split_size`` bytes.

    Every ``pytorch_model-*.bin`` file under ``model_path`` is loaded in turn
    and its tensors are regrouped into new ``pytorch_model-{part}.bin`` files
    written to ``tmp_path``. Shards never mix tensors from two input files. A
    single tensor larger than ``split_size`` is stored alone in its own shard.

    Args:
        model_path: Local directory holding the checkpoint. If the path does
            not exist it is passed to ``snapshot_download`` as a repo id.
        tmp_path: Output directory; created if missing.
        split_size: Size budget per output shard, in bytes.

    Raises:
        Exception: Any error raised while splitting is re-raised after
            ``tmp_path`` has been removed.
    """
    if not os.path.exists(model_path):
        model_path = snapshot_download(repo_id=model_path)
    os.makedirs(tmp_path, exist_ok=True)

    file_pattern = os.path.join(model_path, "pytorch_model-*.bin")
    files = glob.glob(file_pattern)

    part = 0
    try:
        for file_path in tqdm(files):
            # NOTE(review): torch.load unpickles the file; only run this on
            # checkpoints from a trusted source.
            state_dict = torch.load(file_path)
            new_state_dict = {}
            current_size = 0

            for name, param in state_dict.items():
                param_size = param.numel() * param.element_size()

                # Flush only a non-empty shard. Without the emptiness check a
                # tensor bigger than split_size at the start of a shard would
                # first write a useless empty file.
                if new_state_dict and current_size + param_size > split_size:
                    _save_shard(new_state_dict, tmp_path, part)
                    part += 1
                    current_size = 0
                    # Drop the references before allocating the next shard.
                    new_state_dict = None
                    gc.collect()
                    new_state_dict = {}

                new_state_dict[name] = param
                current_size += param_size

            # Write whatever is left over from this input file.
            if new_state_dict:
                _save_shard(new_state_dict, tmp_path, part)
                part += 1

            # Release this file's tensors before the next one is loaded so
            # two input files are not held in memory at once.
            del state_dict, new_state_dict
            gc.collect()
    except Exception as e:
        print(f"An error occurred during split_files: {e}")
        shutil.rmtree(tmp_path)
        raise
68
+
69
+
70
def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
    """Add delta weights to a base model shard by shard, using disk as scratch.

    Both checkpoints are first re-split into shards of about 4 GB in temporary
    directories. Each base shard is then loaded, every tensor has the delta
    tensor of the same name added in place, and the result is saved to
    ``target_model_path`` together with a ``pytorch_model.bin.index.json``
    weight map. The tokenizer and config are taken from ``delta_path``.

    Args:
        base_model_path: Base checkpoint (local path or repo id).
        target_model_path: Output directory. It is deleted and recreated if
            it already exists.
        delta_path: Delta checkpoint (local path or repo id).

    Raises:
        FileNotFoundError: If splitting the delta produced no shard files.
        KeyError: If a base parameter is missing from every delta shard.
    """
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
    delta_config = AutoConfig.from_pretrained(delta_path)

    if os.path.exists(target_model_path):
        shutil.rmtree(target_model_path)
    os.makedirs(target_model_path)

    split_size = 4 * GB

    with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
        print(f"Split files for the base model to {tmp_base_path}")
        split_files(base_model_path, tmp_base_path, split_size)
        print(f"Split files for the delta weights to {tmp_delta_path}")
        split_files(delta_path, tmp_delta_path, split_size)

        base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
        base_files = glob.glob(base_pattern)
        delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
        delta_files = glob.glob(delta_pattern)
        if not delta_files:
            # Fail with a readable message instead of an IndexError below.
            raise FileNotFoundError(
                f"No 'pytorch_model-*.bin' shards were produced for the delta "
                f"weights at {delta_path!r}"
            )
        delta_state_dict = torch.load(delta_files[0])

        print("Applying the delta")
        weight_map = {}
        total_size = 0

        # tqdm wraps the list (not the enumerate object) so it knows the total.
        for i, base_file in enumerate(tqdm(base_files)):
            state_dict = torch.load(base_file)
            file_name = f"pytorch_model-{i}.bin"
            for name, param in state_dict.items():
                if name not in delta_state_dict:
                    # Only one delta shard is kept in memory at a time; scan
                    # the shards until one contains this parameter.
                    for delta_file in delta_files:
                        delta_state_dict = torch.load(delta_file)
                        gc.collect()
                        if name in delta_state_dict:
                            break
                    else:
                        raise KeyError(
                            f"Parameter {name!r} not found in any delta shard"
                        )

                state_dict[name] += delta_state_dict[name]
                weight_map[name] = file_name
                total_size += param.numel() * param.element_size()
                gc.collect()
            torch.save(state_dict, os.path.join(target_model_path, file_name))

        with open(
            os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
        ) as f:
            json.dump(
                {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
            )

    print(f"Saving the target model to {target_model_path}")
    delta_tokenizer.save_pretrained(target_model_path)
    delta_config.save_pretrained(target_model_path)
123
+
124
+
125
def apply_delta(base_model_path, target_model_path, delta_path):
    """Add delta weights to a base model fully in memory and save the result.

    Both models are loaded in float16. Every tensor in the base model's state
    dict has the delta tensor of the same name added to it in place. The
    merged model and the delta's tokenizer are then saved to
    ``target_model_path``.

    Args:
        base_model_path: Path or repo id of the base model.
        target_model_path: Directory the merged model is saved to.
        delta_path: Path or repo id of the delta weights.

    Raises:
        AssertionError: If a base parameter has no counterpart in the delta.
    """
    print(f"Loading the delta weights from {delta_path}")
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
    delta = AutoModelForCausalLM.from_pretrained(
        delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print("Applying the delta")
    # Build the delta's state dict once; calling state_dict() inside the loop
    # rebuilt the whole mapping twice per parameter.
    delta_state = delta.state_dict()
    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
        assert name in delta_state, f"Parameter {name!r} is missing from the delta"
        param.data += delta_state[name]

    print(f"Saving the target model to {target_model_path}")
    base.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
145
+
146
+
147
if __name__ == "__main__":
    # Command-line entry point: three required paths plus an optional switch
    # that selects the disk-backed, low-memory merge.
    cli = argparse.ArgumentParser()
    for path_flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        cli.add_argument(path_flag, type=str, required=True)
    cli.add_argument(
        "--low-cpu-mem",
        action="store_true",
        help="Lower the cpu memory usage. This will split large files and use "
        "disk as swap to reduce the memory usage below 10GB.",
    )
    opts = cli.parse_args()

    # Both implementations take the same three positional arguments.
    merge = apply_delta_low_cpu_mem if opts.low_cpu_mem else apply_delta
    merge(opts.base_model_path, opts.target_model_path, opts.delta_path)
src/model/apply_lora.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Apply the LoRA weights on top of a base model.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.apply_lora --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/baize-7b --lora-path project-baize/baize-lora-7B
6
+
7
+ Dependency:
8
+ pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b
9
+ """
10
+ import argparse
11
+
12
+ import torch
13
+ from peft import PeftModel
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM
15
+
16
+
17
def apply_lora(base_model_path, target_model_path, lora_path):
    """Merge a LoRA adapter into its base model and save the merged model.

    Args:
        base_model_path: Path or repo id of the base model; its tokenizer is
            the one saved with the result.
        target_model_path: Directory the merged model and tokenizer go to.
        lora_path: Path or repo id of the LoRA adapter.
    """
    print(f"Loading the base model from {base_model_path}")
    load_options = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
    base = AutoModelForCausalLM.from_pretrained(base_model_path, **load_options)
    base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)

    print(f"Loading the LoRA adapter from {lora_path}")

    # No explicit torch_dtype is passed when attaching the adapter.
    adapter_model = PeftModel.from_pretrained(base, lora_path)

    print("Applying the LoRA")
    merged = adapter_model.merge_and_unload()

    print(f"Saving the target model to {target_model_path}")
    merged.save_pretrained(target_model_path)
    base_tokenizer.save_pretrained(target_model_path)
38
+
39
+
40
if __name__ == "__main__":
    # Command-line entry point: all three paths are required.
    cli = argparse.ArgumentParser()
    for path_flag in ("--base-model-path", "--target-model-path", "--lora-path"):
        cli.add_argument(path_flag, type=str, required=True)

    opts = cli.parse_args()

    apply_lora(opts.base_model_path, opts.target_model_path, opts.lora_path)
src/model/compression.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import gc
3
+ import glob
4
+ import os
5
+
6
+ from accelerate import init_empty_weights
7
+ from accelerate.utils import set_module_tensor_to_device
8
+ from huggingface_hub import snapshot_download
9
+ import torch
10
+ from torch import Tensor
11
+ from torch.nn import functional as F
12
+ import torch.nn as nn
13
+ from tqdm import tqdm
14
+ from transformers import (
15
+ AutoConfig,
16
+ AutoModelForCausalLM,
17
+ AutoTokenizer,
18
+ AutoModel,
19
+ AutoModelForSeq2SeqLM,
20
+ )
21
+
22
+
23
@dataclasses.dataclass
class CompressionConfig:
    """Settings for the group-wise quantization done by compress()/decompress()."""

    # Bit width of a quantized value; compress() asserts it is at most 8.
    num_bits: int
    # Number of elements along `group_dim` that share one scale.
    group_size: int
    # Index of the tensor dimension that is split into groups.
    group_dim: int
    # True: symmetric int8 quantization; False: min/max based uint8 quantization.
    symmetric: bool
    # When False, compress() and decompress() return their input unchanged.
    enabled: bool = True
32
+
33
+
34
+ default_compression_config = CompressionConfig(
35
+ num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
36
+ )
37
+
38
+
39
class CLinear(nn.Module):
    """Linear layer whose weight is kept in compressed (quantized) form."""

    def __init__(self, weight=None, bias=None, device=None):
        """Store ``weight`` and ``bias``, compressing a plain tensor weight.

        A ``Tensor`` weight is moved to ``device`` and compressed with the
        module's default compression config. Any other value (including
        ``None``) is stored as given; callers in this file pass weights that
        were already compressed.
        """
        super().__init__()
        if isinstance(weight, Tensor):
            on_device = weight.data.to(device)
            weight = compress(on_device, default_compression_config)
        self.weight = weight
        self.bias = bias

    def forward(self, input: Tensor) -> Tensor:
        """Decompress the weight and apply it, casting input/bias to its dtype."""
        dense_weight = decompress(self.weight, default_compression_config)
        cast_input = input.to(dense_weight.dtype)
        cast_bias = None if self.bias is None else self.bias.to(dense_weight.dtype)
        return F.linear(cast_input, dense_weight, cast_bias)
57
+
58
+
59
def compress_module(module, target_device):
    """Recursively replace ``torch.nn.Linear`` attributes with ``CLinear``.

    Only attributes whose type is exactly ``torch.nn.Linear`` are replaced;
    the replacement is built from the original weight and bias and placed on
    ``target_device``. The same is then done for every child module.
    """
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        if type(candidate) != torch.nn.Linear:
            continue
        replacement = CLinear(candidate.weight, candidate.bias, target_device)
        setattr(module, attr_name, replacement)

    for child in module.children():
        compress_module(child, target_device)
70
+
71
+
72
def get_compressed_list(module, prefix=""):
    """Return the dotted ``...weight`` names of all ``torch.nn.Linear`` layers.

    Attributes of ``module`` whose type is exactly ``torch.nn.Linear`` are
    listed first (in ``dir()`` order), followed by the results for each child
    module. ``prefix`` is the dotted path of ``module`` inside the root model.
    """

    def qualified(name):
        # Join with the parent path unless we are at the root.
        return f"{prefix}.{name}" if prefix else name

    found = [
        f"{qualified(attr_name)}.weight"
        for attr_name in dir(module)
        if type(getattr(module, attr_name)) == torch.nn.Linear
    ]
    for child_name, child in module.named_children():
        found.extend(get_compressed_list(child, qualified(child_name)))
    return found
86
+
87
+
88
def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
    """Recursively swap ``torch.nn.Linear`` layers for pre-compressed ``CLinear``.

    For every attribute whose type is exactly ``torch.nn.Linear`` the
    compressed weight is looked up in ``compressed_state_dict`` under
    ``<dotted path>.weight`` and wrapped, together with the layer's existing
    bias, in a ``CLinear``. ``prefix`` is the dotted path of ``module`` inside
    the root model.
    """

    def qualified(name):
        # Join with the parent path unless we are at the root.
        return f"{prefix}.{name}" if prefix else name

    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        if type(candidate) != torch.nn.Linear:
            continue
        packed_weight = compressed_state_dict[f"{qualified(attr_name)}.weight"]
        setattr(
            module,
            attr_name,
            CLinear(packed_weight, candidate.bias, target_device),
        )

    for child_name, child in module.named_children():
        apply_compressed_weight(
            child, compressed_state_dict, target_device, qualified(child_name)
        )
107
+
108
+
109
def load_compress_model(model_path, device, torch_dtype, use_fast, revision="main"):
    """Load a model shard by shard, compressing linear weights as they arrive.

    Args:
        model_path: Local folder or Hugging Face repo id of the model.
        device: Device the (compressed) tensors are moved to.
        torch_dtype: dtype every loaded tensor is cast to.
        use_fast: Whether to request a fast tokenizer first.
        revision: Model revision passed to the Hugging Face loaders.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is in eval mode on ``device``.

    Raises:
        ValueError: If no ``pytorch_model*.bin`` or ``*.safetensors`` file is
            found for the model.
    """
    # `use_fast=True` is not supported for some models, so retry with the
    # opposite setting. Note: `not`, not `~` -- `~True` is -2, which is truthy.
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=use_fast, revision=revision, trust_remote_code=True
        )
    except TypeError:
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=not use_fast,
            revision=revision,
            trust_remote_code=True,
        )

    # Build the model skeleton without allocating real weights.
    with init_empty_weights():
        # `trust_remote_code` should be set as `True` for both AutoConfig and AutoModel
        config = AutoConfig.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
            revision=revision,
        )
        # some models are loaded by AutoModel but not AutoModelForCausalLM,
        # such as chatglm, chatglm2
        try:
            # google/flan-* models are based on an AutoModelForSeq2SeqLM.
            if "T5Config" in str(type(config)):
                model = AutoModelForSeq2SeqLM.from_config(
                    config, trust_remote_code=True
                )
            else:
                model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
        except NameError:
            model = AutoModel.from_config(config, trust_remote_code=True)
        # Only membership is tested below, so keep the names in a set.
        linear_weights = set(get_compressed_list(model))

    if os.path.exists(model_path):
        # `model_path` is a local folder
        base_pattern = os.path.join(model_path, "pytorch_model*.bin")
    else:
        # `model_path` is a Hugging Face repo id.
        # We don't necessarily need to download the model's repo again if there
        # is a cache, so check the default huggingface cache first.
        model_path_temp = os.path.join(
            os.path.expanduser("~"),
            ".cache/huggingface/hub",
            "models--" + model_path.replace("/", "--"),
            "snapshots/",
        )
        downloaded = False
        if os.path.exists(model_path_temp):
            snapshot_names = os.listdir(model_path_temp)
            # An empty snapshots folder simply means "not cached".
            if snapshot_names:
                # NOTE: os.listdir order is arbitrary; the last entry is used
                # as-is, matching the previous behavior.
                model_path_temp = os.path.join(model_path_temp, snapshot_names[-1])
                base_pattern = os.path.join(model_path_temp, "pytorch_model*.bin")
                downloaded = len(glob.glob(base_pattern)) > 0

        if downloaded:
            model_path = model_path_temp
        else:
            model_path = snapshot_download(model_path, revision=revision)
        base_pattern = os.path.join(model_path, "pytorch_model*.bin")

    # Prefer PyTorch .bin shards; fall back to safetensors.
    files = glob.glob(base_pattern)
    use_safetensors = False
    if len(files) == 0:
        base_pattern = os.path.join(model_path, "*.safetensors")
        files = glob.glob(base_pattern)
        use_safetensors = True
    if len(files) == 0:
        raise ValueError(
            f"Cannot find any model weight files. "
            f"Please check your (cached) weight path: {model_path}"
        )

    compressed_state_dict = {}
    if use_safetensors:
        # Imported lazily so safetensors is only needed for such checkpoints.
        from safetensors.torch import load_file
    for filename in tqdm(files):
        if use_safetensors:
            tmp_state_dict = load_file(filename)
        else:
            tmp_state_dict = torch.load(
                filename, map_location=lambda storage, loc: storage
            )
        for name in tmp_state_dict:
            if name in linear_weights:
                tensor = tmp_state_dict[name].to(device, dtype=torch_dtype)
                compressed_state_dict[name] = compress(
                    tensor, default_compression_config
                )
            else:
                compressed_state_dict[name] = tmp_state_dict[name].to(
                    device, dtype=torch_dtype
                )
            # Drop references to the uncompressed tensor right away to keep
            # peak memory low while the remaining tensors are processed.
            tmp_state_dict[name] = None
            tensor = None
            gc.collect()
            torch.cuda.empty_cache()
            if device == "xpu":
                torch.xpu.empty_cache()
            if device == "npu":
                torch.npu.empty_cache()

    # Non-linear tensors are materialized directly on the skeleton model ...
    for name in model.state_dict():
        if name not in linear_weights:
            set_module_tensor_to_device(
                model, name, device, value=compressed_state_dict[name]
            )
    # ... while linear layers are replaced by their compressed counterparts.
    apply_compressed_weight(model, compressed_state_dict, device)

    if torch_dtype == torch.float16:
        model.half()
    model.to(device)
    model.eval()

    return model, tokenizer
224
+
225
+
226
def compress(tensor, config):
    """Simulate group-wise quantization.

    ``tensor`` is split along ``config.group_dim`` into groups of
    ``config.group_size`` elements (zero-padded to a whole number of groups)
    and each group is quantized with its own scale.

    Args:
        tensor: Tensor to quantize.
        config: Object exposing ``enabled``, ``group_size``, ``num_bits``,
            ``group_dim`` and ``symmetric``.

    Returns:
        ``tensor`` itself when ``config.enabled`` is false. Otherwise
        ``(data, scale, original_shape)`` with int8 ``data`` for symmetric
        quantization, or ``(data, mn, scale, original_shape)`` with uint8
        ``data`` for asymmetric quantization.
    """
    if not config.enabled:
        return tensor

    group_size, num_bits, group_dim, symmetric = (
        config.group_size,
        config.num_bits,
        config.group_dim,
        config.symmetric,
    )
    assert num_bits <= 8

    original_shape = tensor.shape
    num_groups = (original_shape[group_dim] + group_size - 1) // group_size
    new_shape = (
        original_shape[:group_dim]
        + (num_groups, group_size)
        + original_shape[group_dim + 1 :]
    )

    # Pad with zeros so the grouped dimension is a multiple of group_size.
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if pad_len != 0:
        pad_shape = (
            original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :]
        )
        tensor = torch.cat(
            [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
            dim=group_dim,
        )
    data = tensor.view(new_shape)

    # Quantize
    if symmetric:
        B = 2 ** (num_bits - 1) - 1
        max_abs = torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
        # An all-zero group would give scale = inf and 0 * inf = NaN; use 1 as
        # the divisor there so the group quantizes (and dequantizes) to zeros.
        max_abs = torch.where(max_abs == 0, torch.ones_like(max_abs), max_abs)
        scale = B / max_abs
        data = data * scale
        data = data.clamp_(-B, B).round_().to(torch.int8)
        return data, scale, original_shape
    else:
        B = 2**num_bits - 1
        mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
        mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]

        value_range = mx - mn
        # A constant group has zero range; dividing by it would produce
        # inf/NaN. With a divisor of 1 the group quantizes to 0 and
        # dequantizes back to `mn`.
        value_range = torch.where(
            value_range == 0, torch.ones_like(value_range), value_range
        )
        scale = B / value_range
        data = data - mn
        data.mul_(scale)

        data = data.clamp_(0, B).round_().to(torch.uint8)
        return data, mn, scale, original_shape
277
+
278
+
279
def decompress(packed_data, config):
    """Simulate group-wise dequantization.

    Args:
        packed_data: The tuple produced by ``compress`` --
            ``(data, scale, original_shape)`` for symmetric quantization or
            ``(data, mn, scale, original_shape)`` for asymmetric quantization.
        config: Object exposing ``enabled``, ``group_size``, ``group_dim`` and
            ``symmetric``; it must match the config used for compression.

    Returns:
        ``packed_data`` itself when ``config.enabled`` is false, otherwise the
        dequantized tensor with shape ``original_shape``.
    """
    if not config.enabled:
        return packed_data

    group_size, group_dim, symmetric = (
        config.group_size,
        config.group_dim,
        config.symmetric,
    )

    # Dequantize
    if symmetric:
        data, scale, original_shape = packed_data
        data = data / scale
    else:
        data, mn, scale, original_shape = packed_data
        data = data / scale
        data.add_(mn)

    # Unpad
    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
    if pad_len:
        padded_original_shape = (
            original_shape[:group_dim]
            + (original_shape[group_dim] + pad_len,)
            + original_shape[group_dim + 1 :]
        )
        data = data.reshape(padded_original_shape)
        # Multi-dimensional indexing needs a tuple; a list of slices is
        # deprecated as an index.
        indices = tuple(slice(0, x) for x in original_shape)
        return data[indices].contiguous()
    else:
        return data.view(original_shape)
src/model/convert_fp16.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 -m fastchat.model.convert_fp16 --in in-folder --out out-folder
4
+ """
5
+ import argparse
6
+
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch
9
+
10
+
11
def convert_fp16(in_checkpoint, out_checkpoint):
    """Reload a causal-LM checkpoint in float16 and save it elsewhere.

    The slow tokenizer and the model are read from ``in_checkpoint``; both are
    written to ``out_checkpoint``.
    """
    tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
    fp16_load_options = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
    model = AutoModelForCausalLM.from_pretrained(in_checkpoint, **fp16_load_options)

    # Model first, tokenizer second: both land in the same output folder.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(out_checkpoint)
18
+
19
+
20
if __name__ == "__main__":
    # Command-line entry point: read the input/output checkpoint paths.
    cli = argparse.ArgumentParser()
    option_help = {
        "--in-checkpoint": "Path to the model",
        "--out-checkpoint": "Path to the output model",
    }
    for flag, description in option_help.items():
        cli.add_argument(flag, type=str, help=description)
    cli_args = cli.parse_args()

    convert_fp16(cli_args.in_checkpoint, cli_args.out_checkpoint)
src/model/llama_condense_monkey_patch.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
2
+
3
+ from functools import partial
4
+
5
+ import torch
6
+ import transformers
7
+ import transformers.models.llama.modeling_llama
8
+
9
+
10
class CondenseRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding whose positions are divided by ``ratio``.

    Cosine/sine tables are precomputed for ``max_position_embeddings * ratio``
    positions and rebuilt on demand when a longer sequence is requested.
    """

    def __init__(
        self, dim, ratio, max_position_embeddings=2048, base=10000, device=None
    ):
        """Precompute the inverse frequencies and the cos/sin caches.

        Args:
            dim: Rotary dimension; frequencies are built for ``dim / 2`` pairs.
            ratio: Factor the position indices are divided by; the cached
                length is multiplied by the same factor.
            max_position_embeddings: Unscaled number of cached positions.
            base: Base of the geometric frequency progression.
            device: Device the inverse frequencies are created on.
        """
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.ratio = ratio
        max_position_embeddings *= ratio
        self.max_seq_len_cached = max_position_embeddings
        t = (
            torch.arange(
                self.max_seq_len_cached,
                device=self.inv_freq.device,
                dtype=self.inv_freq.dtype,
            )
            / ratio
        )
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        dtype = torch.get_default_dtype()
        self.register_buffer(
            "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False
        )

    def forward(self, x, seq_len=None):
        """Return ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            x: Tensor laid out as [bs, num_attention_heads, seq_len, head_size];
                only its dtype, device and (when ``seq_len`` is omitted) its
                second-to-last dimension are used.
            seq_len: Number of positions to return. Defaults to
                ``x.shape[-2]``, based on the layout above.

        Returns:
            Two tensors of shape ``[1, 1, seq_len, dim]`` cast to ``x.dtype``.
        """
        if seq_len is None:
            # The signature advertises an optional seq_len, but comparing None
            # with an int raises TypeError; derive it from x instead.
            seq_len = x.shape[-2]
        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = (
                torch.arange(
                    self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
                )
                / self.ratio
            )
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in order to obtain the same calculation
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.register_buffer(
                "cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False
            )
            self.register_buffer(
                "sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False
            )
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )
66
+
67
+
68
def replace_llama_with_condense(ratio):
    """Monkey-patch transformers' LLaMA rotary embedding with the condensed one.

    After this call ``modeling_llama.LlamaRotaryEmbedding`` is a partial of
    ``CondenseRotaryEmbedding`` with ``ratio`` already bound.
    """
    condensed_factory = partial(CondenseRotaryEmbedding, ratio=ratio)
    llama_module = transformers.models.llama.modeling_llama
    llama_module.LlamaRotaryEmbedding = condensed_factory
src/model/make_delta.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Make the delta weights by subtracting base weights.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
6
+ """
7
+ import argparse
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+
13
+
14
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id=None):
    """Compute ``target - base`` weights and save them as a delta checkpoint.

    Args:
        base_model_path: Path or repo id of the base model.
        target_model_path: Path or repo id of the fine-tuned model; its
            tokenizer is saved alongside the delta.
        delta_path: Output folder for the delta weights and tokenizer.
        hub_repo_id: Optional Hugging Face repo id. When given, the delta and
            tokenizer are also pushed to that repo.

    Raises:
        AssertionError: If the target has a tensor the base model lacks.
    """
    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )

    print(f"Loading the target model from {target_model_path}")
    target = AutoModelForCausalLM.from_pretrained(
        target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)

    print("Calculating the delta")
    # Build the base state dict once instead of on every loop iteration.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        assert name in base_state
        # In-place subtraction turns the target weights into the delta.
        param.data -= base_state[name]

    print(f"Saving the delta to {delta_path}")
    # The repo id is an explicit argument (not the CLI's global `args`) so the
    # function also works when imported and called from other code.
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer.save_pretrained(delta_path, **kwargs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str)
    args = parser.parse_args()

    make_delta(
        args.base_model_path,
        args.target_model_path,
        args.delta_path,
        hub_repo_id=args.hub_repo_id,
    )
src/model/model_adapter.py ADDED
@@ -0,0 +1,2524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model adapter registration."""
2
+
3
+ import math
4
+ import os
5
+ import re
6
+ import sys
7
+ from typing import Dict, List, Optional
8
+ import warnings
9
+
10
+ if sys.version_info >= (3, 9):
11
+ from functools import cache
12
+ else:
13
+ from functools import lru_cache as cache
14
+
15
+ import psutil
16
+ import torch
17
+ from transformers import (
18
+ AutoConfig,
19
+ AutoModel,
20
+ AutoModelForCausalLM,
21
+ AutoModelForSeq2SeqLM,
22
+ AutoTokenizer,
23
+ LlamaTokenizer,
24
+ LlamaForCausalLM,
25
+ T5Tokenizer,
26
+ )
27
+
28
+ from src.constants import CPU_ISA
29
+ from src.conversation import Conversation, get_conv_template
30
+ from src.model.compression import load_compress_model
31
+ from src.model.llama_condense_monkey_patch import replace_llama_with_condense
32
+ from src.model.model_chatglm import generate_stream_chatglm
33
+ from src.model.model_codet5p import generate_stream_codet5p
34
+ from src.model.model_falcon import generate_stream_falcon
35
+ from src.model.model_yuan2 import generate_stream_yuan2
36
+ from src.model.model_exllama import generate_stream_exllama
37
+ from src.model.model_xfastertransformer import generate_stream_xft
38
+ from src.model.model_cllm import generate_stream_cllm
39
+
40
+ from src.model.monkey_patch_non_inplace import (
41
+ replace_llama_attn_with_non_inplace_operations,
42
+ )
43
+ from src.modules.awq import AWQConfig, load_awq_quantized
44
+ from src.modules.exllama import ExllamaConfig, load_exllama_model
45
+ from src.modules.xfastertransformer import load_xft_model, XftConfig
46
+ from src.modules.gptq import GptqConfig, load_gptq_quantized
47
+ from src.utils import get_gpu_memory
48
+
49
+ # Check an environment variable to check if we should be sharing Peft model
50
+ # weights. When false we treat all Peft models as separate.
51
+ peft_share_base_weights = (
52
+ os.environ.get("PEFT_SHARE_BASE_WEIGHTS", "false").lower() == "true"
53
+ )
54
+
55
+ ANTHROPIC_MODEL_LIST = (
56
+ "claude-1",
57
+ "claude-2",
58
+ "claude-2.0",
59
+ "claude-2.1",
60
+ "claude-3-haiku-20240307",
61
+ "claude-3-haiku-20240307-vertex",
62
+ "claude-3-sonnet-20240229",
63
+ "claude-3-sonnet-20240229-vertex",
64
+ "claude-3-opus-20240229",
65
+ "claude-instant-1",
66
+ "claude-instant-1.2",
67
+ )
68
+
69
+ OPENAI_MODEL_LIST = (
70
+ "gpt-3.5-turbo",
71
+ "gpt-3.5-turbo-0301",
72
+ "gpt-3.5-turbo-0613",
73
+ "gpt-3.5-turbo-1106",
74
+ "gpt-3.5-turbo-0125",
75
+ "gpt-4",
76
+ "gpt-4-0314",
77
+ "gpt-4-0613",
78
+ "gpt-4-turbo",
79
+ "gpt-4-1106-preview",
80
+ "gpt-4-0125-preview",
81
+ "gpt-4-turbo-browsing",
82
+ "gpt-4-turbo-2024-04-09",
83
+ )
84
+
85
+
86
class BaseModelAdapter:
    """Default adapter; it accepts any model path and loads via the Auto* classes."""

    # Whether a fast tokenizer is requested on the first loading attempt.
    use_fast_tokenizer = True

    def match(self, model_path: str):
        """Return True for every path: this adapter is the catch-all."""
        return True

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load ``(model, tokenizer)`` from ``model_path``.

        The tokenizer is retried with ``use_fast=False`` on ``TypeError``; the
        model falls back from ``AutoModelForCausalLM`` to ``AutoModel`` on
        ``NameError``. ``from_pretrained_kwargs`` is forwarded to the model
        loader and supplies the tokenizer revision (default ``"main"``).
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer_options = {"revision": revision, "trust_remote_code": True}
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, use_fast=self.use_fast_tokenizer, **tokenizer_options
            )
        except TypeError:
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, use_fast=False, **tokenizer_options
            )

        model_options = dict(
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        try:
            model = AutoModelForCausalLM.from_pretrained(model_path, **model_options)
        except NameError:
            model = AutoModel.from_pretrained(model_path, **model_options)
        return model, tokenizer

    def load_compress_model(self, model_path, device, torch_dtype, revision="main"):
        """Delegate to the module-level ``load_compress_model`` helper."""
        loader_options = {
            "use_fast": self.use_fast_tokenizer,
            "revision": revision,
        }
        return load_compress_model(model_path, device, torch_dtype, **loader_options)

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the ``"one_shot"`` conversation template."""
        return get_conv_template("one_shot")
134
+
135
+
136
+ # A global registry for all model adapters
137
+ # TODO (lmzheng): make it a priority queue.
138
+ model_adapters: List[BaseModelAdapter] = []
139
+
140
+
141
def register_model_adapter(cls):
    """Instantiate ``cls`` and append the instance to the global registry."""
    adapter_instance = cls()
    model_adapters.append(adapter_instance)
144
+
145
+
146
@cache
def get_model_adapter(model_path: str) -> BaseModelAdapter:
    """Return the first registered adapter that matches ``model_path``.

    The last path component is tried first, skipping plain
    ``BaseModelAdapter`` instances; the full path is tried afterwards with
    every adapter. Raises ``ValueError`` when nothing matches.
    """
    basename = os.path.basename(os.path.normpath(model_path))

    # Pass 1: basename only, ignoring the catch-all base adapter.
    by_basename = next(
        (
            candidate
            for candidate in model_adapters
            if candidate.match(basename) and type(candidate) != BaseModelAdapter
        ),
        None,
    )
    if by_basename is not None:
        return by_basename

    # Pass 2: the full path, any adapter.
    by_full_path = next(
        (candidate for candidate in model_adapters if candidate.match(model_path)),
        None,
    )
    if by_full_path is not None:
        return by_full_path

    raise ValueError(f"No valid model adapter for {model_path}")
162
+
163
+
164
def raise_warning_for_incompatible_cpu_offloading_configuration(
    device: str, load_8bit: bool, cpu_offloading: bool
):
    """Validate a CPU-offloading request and return the effective setting.

    When ``cpu_offloading`` is requested but 8-bit loading is off, the
    platform is not Linux, or ``device`` is not ``"cuda"``, a warning is
    issued and ``False`` is returned. Otherwise ``cpu_offloading`` is returned
    unchanged.
    """
    if not cpu_offloading:
        return cpu_offloading

    # Checks are ordered; only the first failing one is reported.
    rejection = None
    if not load_8bit:
        rejection = (
            "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
            "Use '--load-8bit' to enable 8-bit-quantization\n"
            "Continuing without cpu-offloading enabled\n"
        )
    elif "linux" not in sys.platform:
        rejection = (
            "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
            "Continuing without cpu-offloading enabled\n"
        )
    elif device != "cuda":
        rejection = (
            "CPU-offloading is only enabled when using CUDA-devices\n"
            "Continuing without cpu-offloading enabled\n"
        )

    if rejection is None:
        return cpu_offloading
    warnings.warn(rejection)
    return False
188
+
189
+
190
+ def load_model(
191
+ model_path: str,
192
+ device: str = "cuda",
193
+ num_gpus: int = 1,
194
+ max_gpu_memory: Optional[str] = None,
195
+ dtype: Optional[torch.dtype] = None,
196
+ load_8bit: bool = False,
197
+ cpu_offloading: bool = False,
198
+ gptq_config: Optional[GptqConfig] = None,
199
+ awq_config: Optional[AWQConfig] = None,
200
+ exllama_config: Optional[ExllamaConfig] = None,
201
+ xft_config: Optional[XftConfig] = None,
202
+ revision: str = "main",
203
+ debug: bool = False,
204
+ ):
205
+ """Load a model from Hugging Face."""
206
+ import accelerate
207
+
208
+ # get model adapter
209
+ adapter = get_model_adapter(model_path)
210
+
211
+ # Handle device mapping
212
+ cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
213
+ device, load_8bit, cpu_offloading
214
+ )
215
+ if device == "cpu":
216
+ kwargs = {"torch_dtype": torch.float32}
217
+ if CPU_ISA in ["avx512_bf16", "amx"]:
218
+ try:
219
+ import intel_extension_for_pytorch as ipex
220
+
221
+ kwargs = {"torch_dtype": torch.bfloat16}
222
+ except ImportError:
223
+ warnings.warn(
224
+ "Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference"
225
+ )
226
+ elif device == "cuda":
227
+ kwargs = {"torch_dtype": torch.float16}
228
+ if num_gpus != 1:
229
+ kwargs["device_map"] = "auto"
230
+ if max_gpu_memory is None:
231
+ kwargs[
232
+ "device_map"
233
+ ] = "sequential" # This is important for not the same VRAM sizes
234
+ available_gpu_memory = get_gpu_memory(num_gpus)
235
+ kwargs["max_memory"] = {
236
+ i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
237
+ for i in range(num_gpus)
238
+ }
239
+ else:
240
+ kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
241
+ elif device == "mps":
242
+ kwargs = {"torch_dtype": torch.float16}
243
+ import transformers
244
+
245
+ version = tuple(int(v) for v in transformers.__version__.split("."))
246
+ if version < (4, 35, 0):
247
+ # NOTE: Recent transformers library seems to fix the mps issue, also
248
+ # it has made some changes causing compatibility issues with our
249
+ # original patch. So we only apply the patch for older versions.
250
+
251
+ # Avoid bugs in mps backend by not using in-place operations.
252
+ replace_llama_attn_with_non_inplace_operations()
253
+ elif device == "xpu":
254
+ kwargs = {"torch_dtype": torch.bfloat16}
255
+ # Try to load ipex, while it looks unused, it links into torch for xpu support
256
+ try:
257
+ import intel_extension_for_pytorch as ipex
258
+ except ImportError:
259
+ warnings.warn(
260
+ "Intel Extension for PyTorch is not installed, but is required for xpu inference."
261
+ )
262
+ elif device == "npu":
263
+ kwargs = {"torch_dtype": torch.float16}
264
+ # Try to load ipex, while it looks unused, it links into torch for xpu support
265
+ try:
266
+ import torch_npu
267
+ except ImportError:
268
+ warnings.warn("Ascend Extension for PyTorch is not installed.")
269
+ else:
270
+ raise ValueError(f"Invalid device: {device}")
271
+
272
+ if cpu_offloading:
273
+ # raises an error on incompatible platforms
274
+ from transformers import BitsAndBytesConfig
275
+
276
+ if "max_memory" in kwargs:
277
+ kwargs["max_memory"]["cpu"] = (
278
+ str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib"
279
+ )
280
+ kwargs["quantization_config"] = BitsAndBytesConfig(
281
+ load_in_8bit_fp32_cpu_offload=cpu_offloading
282
+ )
283
+ kwargs["load_in_8bit"] = load_8bit
284
+ elif load_8bit:
285
+ if num_gpus != 1:
286
+ warnings.warn(
287
+ "8-bit quantization is not supported for multi-gpu inference."
288
+ )
289
+ else:
290
+ model, tokenizer = adapter.load_compress_model(
291
+ model_path=model_path,
292
+ device=device,
293
+ torch_dtype=kwargs["torch_dtype"],
294
+ revision=revision,
295
+ )
296
+ if debug:
297
+ print(model)
298
+ return model, tokenizer
299
+ elif awq_config and awq_config.wbits < 16:
300
+ assert (
301
+ awq_config.wbits == 4
302
+ ), "Currently we only support 4-bit inference for AWQ."
303
+ model, tokenizer = load_awq_quantized(model_path, awq_config, device)
304
+ if num_gpus != 1:
305
+ device_map = accelerate.infer_auto_device_map(
306
+ model,
307
+ max_memory=kwargs["max_memory"],
308
+ no_split_module_classes=[
309
+ "OPTDecoderLayer",
310
+ "LlamaDecoderLayer",
311
+ "BloomBlock",
312
+ "MPTBlock",
313
+ "DecoderLayer",
314
+ ],
315
+ )
316
+ model = accelerate.dispatch_model(
317
+ model, device_map=device_map, offload_buffers=True
318
+ )
319
+ else:
320
+ model.to(device)
321
+ return model, tokenizer
322
+ elif gptq_config and gptq_config.wbits < 16:
323
+ model, tokenizer = load_gptq_quantized(model_path, gptq_config)
324
+ if num_gpus != 1:
325
+ device_map = accelerate.infer_auto_device_map(
326
+ model,
327
+ max_memory=kwargs["max_memory"],
328
+ no_split_module_classes=["LlamaDecoderLayer"],
329
+ )
330
+ model = accelerate.dispatch_model(
331
+ model, device_map=device_map, offload_buffers=True
332
+ )
333
+ else:
334
+ model.to(device)
335
+ return model, tokenizer
336
+ elif exllama_config:
337
+ model, tokenizer = load_exllama_model(model_path, exllama_config)
338
+ return model, tokenizer
339
+ elif xft_config:
340
+ model, tokenizer = load_xft_model(model_path, xft_config)
341
+ return model, tokenizer
342
+ kwargs["revision"] = revision
343
+
344
+ if dtype is not None: # Overwrite dtype if it is provided in the arguments.
345
+ kwargs["torch_dtype"] = dtype
346
+
347
+ if os.environ.get("FASTCHAT_USE_MODELSCOPE", "False").lower() == "true":
348
+ # download model from ModelScope hub,
349
+ # lazy import so that modelscope is not required for normal use.
350
+ try:
351
+ from modelscope.hub.snapshot_download import snapshot_download
352
+
353
+ if not os.path.exists(model_path):
354
+ model_path = snapshot_download(model_id=model_path, revision=revision)
355
+ except ImportError as e:
356
+ warnings.warn(
357
+ "Use model from www.modelscope.cn need pip install modelscope"
358
+ )
359
+ raise e
360
+
361
+ # Load model
362
+ model, tokenizer = adapter.load_model(model_path, kwargs)
363
+
364
+ if (
365
+ device == "cpu"
366
+ and kwargs["torch_dtype"] is torch.bfloat16
367
+ and CPU_ISA is not None
368
+ ):
369
+ model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
370
+
371
+ if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
372
+ "mps",
373
+ "xpu",
374
+ "npu",
375
+ ):
376
+ model.to(device)
377
+
378
+ if device == "xpu":
379
+ model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
380
+
381
+ if debug:
382
+ print(model)
383
+
384
+ return model, tokenizer
385
+
386
+
387
def get_conversation_template(model_path: str) -> Conversation:
    """Return the default conversation template for ``model_path``.

    The adapter for the path is looked up with ``get_model_adapter`` and
    then asked for its default template.
    """
    return get_model_adapter(model_path).get_default_conv_template(model_path)
391
+
392
+
393
def get_generate_stream_function(model: torch.nn.Module, model_path: str):
    """Get the generate_stream function for inference.

    The choice is driven by the lower-cased string form of ``type(model)``
    (and, for consistency-LLM checkpoints, by ``model_path``). When no
    specialised function applies and the model is a PEFT wrapper while
    ``peft_share_base_weights`` is enabled, a wrapper is returned that
    activates the adapter named ``model_path`` before streaming.

    Args:
        model: The loaded model object.
        model_path: The path (or name) the model was loaded from.

    Returns:
        A generator function with the ``generate_stream`` calling convention.
    """
    from fastchat.serve.inference import generate_stream

    def _specialised_stream_function(model_type: str):
        """Return the stream function for a lower-cased type string, or None."""
        if "chatglm" in model_type:
            return generate_stream_chatglm
        if "rwforcausallm" in model_type:
            return generate_stream_falcon
        if "codet5p" in model_type:
            return generate_stream_codet5p
        if "exllama" in model_type:
            return generate_stream_exllama
        if "xft" in model_type:
            return generate_stream_xft
        if "yuan" in model_type:
            return generate_stream_yuan2
        if "consistency-llm" in model_path.lower():
            return generate_stream_cllm
        return None

    model_type = str(type(model)).lower()
    specialised = _specialised_stream_function(model_type)
    if specialised is not None:
        return specialised

    if peft_share_base_weights and "peft" in model_type:
        # Return a curried stream function that loads the right adapter
        # according to the model_name available in this context. This ensures
        # the right weights are available.
        @torch.inference_mode()
        def generate_stream_peft(
            model,
            tokenizer,
            params: Dict,
            device: str,
            context_len: int,
            stream_interval: int = 2,
            judge_sent_end: bool = False,
        ):
            model.set_adapter(model_path)
            # Lower-case the type string so that markers such as
            # "rwforcausallm" can match, exactly as for non-PEFT models.
            base_model_type = str(type(model.base_model.model)).lower()
            generate_stream_function = _specialised_stream_function(base_model_type)
            if generate_stream_function is None:
                generate_stream_function = generate_stream
            for x in generate_stream_function(
                model,
                tokenizer,
                params,
                device,
                context_len,
                stream_interval,
                judge_sent_end,
            ):
                yield x

        return generate_stream_peft

    return generate_stream
475
+
476
+
477
def add_model_args(parser):
    """Register the model-loading command-line options on ``parser``.

    Each entry of the table below is a flag name plus the keyword arguments
    passed to ``parser.add_argument``; options are added in table order.
    """
    option_table = (
        (
            "--model-path",
            {
                "type": str,
                "default": "lmsys/vicuna-7b-v1.5",
                "help": "The path to the weights. This can be a local folder or a Hugging Face repo ID.",
            },
        ),
        (
            "--revision",
            {
                "type": str,
                "default": "main",
                "help": "Hugging Face Hub model revision identifier",
            },
        ),
        (
            "--device",
            {
                "type": str,
                "choices": ["cpu", "cuda", "mps", "xpu", "npu"],
                "default": "cuda",
                "help": "The device type",
            },
        ),
        (
            "--gpus",
            {
                "type": str,
                "default": None,
                "help": "A single GPU like 1 or multiple GPUs like 0,2",
            },
        ),
        ("--num-gpus", {"type": int, "default": 1}),
        (
            "--max-gpu-memory",
            {
                "type": str,
                "help": "The maximum memory per GPU for storing model weights. Use a string like '13Gib'",
            },
        ),
        (
            "--dtype",
            {
                "type": str,
                "choices": ["float32", "float16", "bfloat16"],
                "help": "Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
                "default": None,
            },
        ),
        (
            "--load-8bit",
            {"action": "store_true", "help": "Use 8-bit quantization"},
        ),
        (
            "--cpu-offloading",
            {
                "action": "store_true",
                "help": "Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
            },
        ),
        (
            "--gptq-ckpt",
            {
                "type": str,
                "default": None,
                "help": "Used for GPTQ. The path to the local GPTQ checkpoint.",
            },
        ),
        (
            "--gptq-wbits",
            {
                "type": int,
                "default": 16,
                "choices": [2, 3, 4, 8, 16],
                "help": "Used for GPTQ. #bits to use for quantization",
            },
        ),
        (
            "--gptq-groupsize",
            {
                "type": int,
                "default": -1,
                "help": "Used for GPTQ. Groupsize to use for quantization; default uses full row.",
            },
        ),
        (
            "--gptq-act-order",
            {
                "action": "store_true",
                "help": "Used for GPTQ. Whether to apply the activation order GPTQ heuristic",
            },
        ),
        (
            "--awq-ckpt",
            {
                "type": str,
                "default": None,
                "help": "Used for AWQ. Load quantized model. The path to the local AWQ checkpoint.",
            },
        ),
        (
            "--awq-wbits",
            {
                "type": int,
                "default": 16,
                "choices": [4, 16],
                "help": "Used for AWQ. #bits to use for AWQ quantization",
            },
        ),
        (
            "--awq-groupsize",
            {
                "type": int,
                "default": -1,
                "help": "Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
            },
        ),
        (
            "--enable-exllama",
            {
                "action": "store_true",
                "help": "Used for exllamabv2. Enable exllamaV2 inference framework.",
            },
        ),
        (
            "--exllama-max-seq-len",
            {
                "type": int,
                "default": 4096,
                "help": "Used for exllamabv2. Max sequence length to use for exllamav2 framework; default 4096 sequence length.",
            },
        ),
        (
            "--exllama-gpu-split",
            {
                "type": str,
                "default": None,
                "help": "Used for exllamabv2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
            },
        ),
        (
            "--exllama-cache-8bit",
            {
                "action": "store_true",
                "help": "Used for exllamabv2. Use 8-bit cache to save VRAM.",
            },
        ),
        (
            "--enable-xft",
            {
                "action": "store_true",
                "help": "Used for xFasterTransformer Enable xFasterTransformer inference framework.",
            },
        ),
        (
            "--xft-max-seq-len",
            {
                "type": int,
                "default": 4096,
                "help": "Used for xFasterTransformer. Max sequence length to use for xFasterTransformer framework; default 4096 sequence length.",
            },
        ),
        (
            "--xft-dtype",
            {
                "type": str,
                "choices": ["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"],
                "help": "Override the default dtype. If not set, it will use bfloat16 for first token and float16 next tokens on CPU.",
                "default": None,
            },
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
607
+
608
+
609
def remove_parent_directory_name(model_path):
    """Return the last "/"-separated component of ``model_path``.

    A single trailing "/" is dropped before splitting. An empty string is
    returned unchanged rather than raising ``IndexError``.
    """
    # endswith() is safe on "", unlike indexing with [-1].
    if model_path.endswith("/"):
        model_path = model_path[:-1]
    return model_path.split("/")[-1]
614
+
615
+
616
# Maps a base model path to the (model, tokenizer) pair first loaded for it;
# PeftModelAdapter.load_model fills it when peft_share_base_weights is set.
peft_model_cache = {}
617
+
618
+
619
class PeftModelAdapter:
    """Loads any "peft" model and its base model."""

    def match(self, model_path: str):
        """Accept paths holding an adapter_config.json or with "peft" in the name."""
        if os.path.exists(os.path.join(model_path, "adapter_config.json")):
            return True
        return "peft" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the base model, then the (peft) adapter weights on top of it.

        Raises:
            ValueError: If the base model path itself contains "peft"
                (compared case-insensitively, the same way ``match`` does).
        """
        from peft import PeftConfig, PeftModel

        config = PeftConfig.from_pretrained(model_path)
        base_model_path = config.base_model_name_or_path
        # Case-insensitive so the guard agrees with match(), which would
        # otherwise claim such a base path for this adapter again.
        if "peft" in base_model_path.lower():
            raise ValueError(
                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
            )

        # Basic proof of concept for loading peft adapters that share the base
        # weights. This is pretty messy because Peft re-writes the underlying
        # base model and internally stores a map of adapter layers.
        # So, to make this work we:
        #  1. Cache the first peft model loaded for a given base model.
        #  2. Call `load_adapter` for any follow-on Peft models.
        #  3. Make sure we load the adapters by the model_path. Why? This is
        #     what's accessible during inference time.
        #  4. In get_generate_stream_function, make sure we load the right
        #     adapter before doing inference. This *should* be safe when calls
        #     are blocked by the same semaphore.
        if peft_share_base_weights and base_model_path in peft_model_cache:
            model, tokenizer = peft_model_cache[base_model_path]
            # Super important: make sure we use model_path as the
            # `adapter_name`.
            model.load_adapter(model_path, adapter_name=model_path)
            return model, tokenizer

        # Otherwise the base model weights have to be loaded (again).
        base_adapter = get_model_adapter(base_model_path)
        base_model, tokenizer = base_adapter.load_model(
            base_model_path, from_pretrained_kwargs
        )
        if peft_share_base_weights:
            # Super important: make sure we use model_path as the
            # `adapter_name`.
            model = PeftModel.from_pretrained(
                base_model, model_path, adapter_name=model_path
            )
            peft_model_cache[base_model_path] = (model, tokenizer)
        else:
            model = PeftModel.from_pretrained(base_model, model_path)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Use the conversation template of the base model.

        Raises:
            ValueError: If the base model path itself contains "peft"
                (compared case-insensitively).
        """
        from peft import PeftConfig

        config = PeftConfig.from_pretrained(model_path)
        base_model_path = config.base_model_name_or_path
        if "peft" in base_model_path.lower():
            raise ValueError(
                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
            )
        base_adapter = get_model_adapter(base_model_path)
        return base_adapter.get_default_conv_template(base_model_path)
689
+
690
+
691
class VicunaAdapter(BaseModelAdapter):
    """Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "vicuna" (case-insensitive)."""
        lowered = model_path.lower()
        return "vicuna" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, then warn about old v0 weights."""
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        self.raise_warning_for_old_weights(model)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Use "one_shot" when the last path component contains "v0"."""
        is_v0 = "v0" in remove_parent_directory_name(model_path)
        return get_conv_template("one_shot" if is_v0 else "vicuna_v1.1")

    def raise_warning_for_old_weights(self, model):
        """Warn when a Llama model's vocabulary is larger than 32000."""
        if not isinstance(model, LlamaForCausalLM):
            return
        if not (model.model.vocab_size > 32000):
            return
        warnings.warn(
            "\nYou are probably using the old Vicuna-v0 model, "
            "which will generate unexpected results with the "
            "current fastchat.\nYou can try one of the following methods:\n"
            "1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
            "2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template one_shot`\n"
            "3. Downgrade fschat to fschat==0.1.10 (Not recommended).\n"
        )
727
+
728
+
729
class AiroborosAdapter(BaseModelAdapter):
    """The model adapter for jondurbin/airoboros-*"""

    def match(self, model_path: str):
        """Match paths containing "airoboros" or "spicyboros" (case-insensitive)."""
        if re.search(r"airoboros|spicyboros", model_path, re.I):
            return True
        return False

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick the airoboros template version from markers in the path."""
        if "-3." in model_path or "-3p" in model_path:
            return get_conv_template("airoboros_v3")
        if "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path):
            return get_conv_template("airoboros_v2")
        return get_conv_template("airoboros_v1")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model; MPT-based variants need remote code and a long context.

        Paths without "mpt" are delegated to the base adapter.
        """
        if "mpt" not in model_path.lower():
            return super().load_model(model_path, from_pretrained_kwargs)
        revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            max_seq_len=8192,
            **from_pretrained_kwargs,
        )
        # Load the tokenizer from the same revision as the weights.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, use_fast=True, revision=revision
        )
        return model, tokenizer
758
+
759
+
760
class LongChatAdapter(BaseModelAdapter):
    """Model adapter for LongChat models (e.g., lmsys/longchat-7b-16k)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "longchat" (case-insensitive)."""
        lowered = model_path.lower()
        return "longchat" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Patch Llama with the condense factor from the config, then load."""
        revision = from_pretrained_kwargs.get("revision", "main")

        # Apply monkey patch, TODO(Dacheng): Add flash attention support
        model_config = AutoConfig.from_pretrained(model_path, revision=revision)
        condense_factor = model_config.rope_scaling["factor"]
        replace_llama_with_condense(condense_factor)

        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=self.use_fast_tokenizer, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """LongChat uses the "vicuna_v1.1" template."""
        return get_conv_template("vicuna_v1.1")
787
+
788
+
789
class GoogleT5Adapter(BaseModelAdapter):
    """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2"""

    def match(self, model_path: str):
        """Match paths containing "flan-", "fastchat-t5" or "codet5p"."""
        lowered = model_path.lower()
        return any(marker in lowered for marker in ("flan-", "fastchat-t5", "codet5p"))

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a T5 tokenizer and a seq2seq model (remote code allowed)."""
        tokenizer = T5Tokenizer.from_pretrained(
            model_path, revision=from_pretrained_kwargs.get("revision", "main")
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer
808
+
809
+
810
class KoalaAdapter(BaseModelAdapter):
    """The model adapter for Koala"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "koala" (case-insensitive)."""
        lowered = model_path.lower()
        return "koala" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "koala_v1" template."""
        return get_conv_template("koala_v1")
820
+
821
+
822
class AlpacaAdapter(BaseModelAdapter):
    """The model adapter for Alpaca"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "alpaca" (case-insensitive)."""
        lowered = model_path.lower()
        return "alpaca" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" template."""
        return get_conv_template("alpaca")
832
+
833
+
834
class ChatGLMAdapter(BaseModelAdapter):
    """The model adapter for THUDM/chatglm-6b, THUDM/chatglm2-6b"""

    def match(self, model_path: str):
        """Match any path containing "chatglm" (case-insensitive)."""
        lowered = model_path.lower()
        return "chatglm" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model with remote code enabled.

        chatglm3 tokenizers are additionally created with
        ``encode_special_tokens=True``.
        """
        tokenizer_options = {
            "trust_remote_code": True,
            "revision": from_pretrained_kwargs.get("revision", "main"),
        }
        if "chatglm3" in model_path.lower():
            tokenizer_options["encode_special_tokens"] = True
        tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_options)
        model = AutoModel.from_pretrained(
            model_path, trust_remote_code=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the chatglm2/chatglm3 template when named, else "chatglm"."""
        lowered = model_path.lower()
        # chatglm2 is checked before chatglm3, as in the original order.
        for generation in ("chatglm2", "chatglm3"):
            if generation in lowered:
                return get_conv_template(generation)
        return get_conv_template("chatglm")
865
+
866
+
867
class CodeGeexAdapter(BaseModelAdapter):
    """The model adapter for THUDM/codegeex-6b, THUDM/codegeex2-6b"""

    def match(self, model_path: str):
        """Match any path containing "codegeex" (case-insensitive)."""
        lowered = model_path.lower()
        return "codegeex" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model with remote code enabled."""
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModel.from_pretrained(
            model_path, trust_remote_code=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "codegeex" template."""
        return get_conv_template("codegeex")
885
+
886
+
887
class DollyV2Adapter(BaseModelAdapter):
    """The model adapter for databricks/dolly-v2-12b"""

    def match(self, model_path: str):
        """Match any path containing "dolly-v2" (case-insensitive)."""
        lowered = model_path.lower()
        return "dolly-v2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model, then set the EOS and pad token ids."""
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, revision=from_pretrained_kwargs.get("revision", "main")
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        # 50277 means "### End"
        tokenizer.eos_token_id = 50277
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "dolly_v2" template."""
        return get_conv_template("dolly_v2")
909
+
910
+
911
class OasstPythiaAdapter(BaseModelAdapter):
    """The model adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"""

    def match(self, model_path: str):
        """Match paths containing both "oasst" and "pythia" (case-insensitive)."""
        lowered = model_path.lower()
        return all(marker in lowered for marker in ("oasst", "pythia"))

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "oasst_pythia" template."""
        return get_conv_template("oasst_pythia")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter and copy the tokenizer's EOS/pad ids."""
        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer
926
+
927
+
928
class OasstLLaMAAdapter(BaseModelAdapter):
    """The model adapter for OpenAssistant/oasst-sft-7-llama-30b"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match the named HF checkpoint, or any "oasst" path without "pythia"."""
        lowered = model_path.lower()
        return "openassistant-sft-7-llama-30b-hf" in lowered or (
            "oasst" in lowered and "pythia" not in lowered
        )

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "oasst_llama" template."""
        return get_conv_template("oasst_llama")
941
+
942
+
943
class OpenChat35Adapter(BaseModelAdapter):
    """The model adapter for OpenChat 3.5 (e.g. openchat/openchat_3.5)"""

    def match(self, model_path: str):
        """Match "openchat" + "3.5" paths, or any "starling-lm" path."""
        lowered = model_path.lower()
        is_openchat_35 = "openchat" in lowered and "3.5" in lowered
        return is_openchat_35 or "starling-lm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "openchat_3.5" template."""
        return get_conv_template("openchat_3.5")
955
+
956
+
957
class TenyxChatAdapter(BaseModelAdapter):
    """The model adapter for TenyxChat (e.g. tenyx/TenyxChat-7B-v1)"""

    def match(self, model_path: str):
        """Match any path containing "tenyxchat" (case-insensitive)."""
        lowered = model_path.lower()
        return "tenyxchat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tenyxchat" template."""
        return get_conv_template("tenyxchat")
965
+
966
+
967
class PythiaAdapter(BaseModelAdapter):
    """The model adapter for any EleutherAI/pythia model"""

    def match(self, model_path: str):
        """Match any path containing "pythia" (case-insensitive)."""
        lowered = model_path.lower()
        return "pythia" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter and copy the tokenizer's EOS/pad ids."""
        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer
978
+
979
+
980
class StableLMAdapter(BaseModelAdapter):
    """The model adapter for StabilityAI/stablelm-tuned-alpha-7b"""

    def match(self, model_path: str):
        """Match any path containing "stablelm" (case-insensitive)."""
        lowered = model_path.lower()
        return "stablelm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "stablelm" template."""
        return get_conv_template("stablelm")
988
+
989
+
990
class MPTAdapter(BaseModelAdapter):
    """The model adapter for MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)"""

    def match(self, model_path: str):
        """Match "mpt" paths, except those that also contain "airoboros"."""
        lowered = model_path.lower()
        return "mpt" in lowered and "airoboros" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load model then tokenizer with remote code; copy EOS/pad ids."""
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            max_seq_len=8192,
            **from_pretrained_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=tokenizer_revision
        )
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the template named after the MPT variant, else "zero_shot".

        A warning is printed when falling back to "zero_shot".
        """
        lowered = model_path.lower()
        for variant in ("mpt-7b-chat", "mpt-30b-chat", "mpt-30b-instruct"):
            if variant in lowered:
                return get_conv_template(variant)
        print(
            "Warning: Loading base MPT model with `zero_shot` conversation configuration.  "
            "If this is not desired, inspect model configurations and names."
        )
        return get_conv_template("zero_shot")
1027
+
1028
+
1029
class BaizeAdapter(BaseModelAdapter):
    """The model adapter for project-baize/baize-v2-7b"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "baize" (case-insensitive)."""
        lowered = model_path.lower()
        return "baize" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "baize" template."""
        return get_conv_template("baize")
1039
+
1040
+
1041
class RwkvAdapter(BaseModelAdapter):
    """The model adapter for BlinkDL/RWKV-4-Raven"""

    def match(self, model_path: str):
        """Match any path containing "rwkv-4" (case-insensitive)."""
        lowered = model_path.lower()
        return "rwkv-4" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Build an ``RwkvModel`` and pair it with the pythia-160m tokenizer."""
        from fastchat.model.rwkv_model import RwkvModel

        model = RwkvModel(model_path)
        # NOTE(review): the revision requested for the RWKV weights is applied
        # to the EleutherAI/pythia-160m tokenizer repo - confirm this is intended.
        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/pythia-160m",
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "rwkv" template."""
        return get_conv_template("rwkv")
1059
+
1060
+
1061
class OpenBuddyAdapter(BaseModelAdapter):
    """The model adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "openbuddy" (case-insensitive)."""
        lowered = model_path.lower()
        return "openbuddy" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "openbuddy" template."""
        return get_conv_template("openbuddy")
1071
+
1072
+
1073
class PhoenixAdapter(BaseModelAdapter):
    """The model adapter for FreedomIntelligence/phoenix-inst-chat-7b"""

    def match(self, model_path: str):
        """Match any path containing "phoenix" (case-insensitive)."""
        lowered = model_path.lower()
        return "phoenix" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "phoenix" template."""
        return get_conv_template("phoenix")
1081
+
1082
+
1083
class ReaLMAdapter(BaseModelAdapter):
    """The model adapter for FreedomIntelligence/ReaLM-7b"""

    def match(self, model_path: str):
        """Match any path containing "ReaLM" (case-sensitive)."""
        return "ReaLM" in model_path

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a fast tokenizer and the causal LM from the same revision."""
        # Load the tokenizer from the same revision as the weights.
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, use_fast=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "ReaLM-7b-v1" template."""
        return get_conv_template("ReaLM-7b-v1")
1098
+
1099
+
1100
class ChatGPTAdapter(BaseModelAdapter):
    """The model adapter for ChatGPT"""

    def match(self, model_path: str):
        """Match names listed in ``OPENAI_MODEL_LIST``."""
        return model_path in OPENAI_MODEL_LIST

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick the template from markers in the model name."""
        if "browsing" in model_path:
            template_name = "api_based_default"
        elif "gpt-4-turbo-2024-04-09" in model_path:
            template_name = "gpt-4-turbo-2024-04-09"
        else:
            template_name = "chatgpt"
        return get_conv_template(template_name)
1115
+
1116
+
1117
class AzureOpenAIAdapter(BaseModelAdapter):
    """The model adapter for Azure OpenAI"""

    def match(self, model_path: str):
        """Match the two supported Azure deployment names exactly."""
        supported = ("azure-gpt-35-turbo", "azure-gpt-4")
        return model_path in supported

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chatgpt" template."""
        return get_conv_template("chatgpt")
1128
+
1129
+
1130
class PplxAIAdapter(BaseModelAdapter):
    """The model adapter for Perplexity AI"""

    def match(self, model_path: str):
        """Match the two supported pplx online model names exactly."""
        supported = ("pplx-7b-online", "pplx-70b-online")
        return model_path in supported

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "pplxai" template."""
        return get_conv_template("pplxai")
1144
+
1145
+
1146
class ClaudeAdapter(BaseModelAdapter):
    """The model adapter for Claude"""

    def match(self, model_path: str):
        """Match names listed in ``ANTHROPIC_MODEL_LIST``."""
        return model_path in ANTHROPIC_MODEL_LIST

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the dated Claude 3 template when named, else "claude"."""
        # (marker in the model name, template to use), checked in this order.
        dated_templates = (
            ("claude-3-haiku", "claude-3-haiku-20240307"),
            ("claude-3-sonnet", "claude-3-sonnet-20240229"),
            ("claude-3-opus", "claude-3-opus-20240229"),
        )
        for marker, template_name in dated_templates:
            if marker in model_path:
                return get_conv_template(template_name)
        return get_conv_template("claude")
1163
+
1164
+
1165
class BardAdapter(BaseModelAdapter):
    """The model adapter for Bard"""

    def match(self, model_path: str):
        """Match the exact name "bard"."""
        is_bard = model_path == "bard"
        return is_bard

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "bard" template."""
        return get_conv_template("bard")
1176
+
1177
+
1178
class PaLM2Adapter(BaseModelAdapter):
    """The model adapter for PaLM2"""

    def match(self, model_path: str):
        """Match the exact name "palm-2"."""
        is_palm2 = model_path == "palm-2"
        return is_palm2

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "bard" template."""
        return get_conv_template("bard")
1189
+
1190
+
1191
class GeminiAdapter(BaseModelAdapter):
    """The model adapter for Gemini"""

    def match(self, model_path: str):
        """Match paths containing "gemini" or "bard" (case-insensitive)."""
        lowered = model_path.lower()
        return "gemini" in lowered or "bard" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "gemini" template."""
        return get_conv_template("gemini")
1202
+
1203
+
1204
class GeminiDevAdapter(BaseModelAdapter):
    """The model adapter for Gemini 1.5 Pro"""

    def match(self, model_path: str):
        """Match any path containing "gemini-1.5-pro" (case-insensitive)."""
        lowered = model_path.lower()
        return "gemini-1.5-pro" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported: this adapter never loads local weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "gemini-dev" template."""
        return get_conv_template("gemini-dev")
1215
+
1216
+
1217
class BiLLaAdapter(BaseModelAdapter):
    """The model adapter for Neutralzz/BiLLa-7B-SFT"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "billa"."""
        lowered = model_path.lower()
        return "billa" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "billa" conversation template."""
        template_name = "billa"
        return get_conv_template(template_name)
1225
+
1226
+
1227
class RedPajamaINCITEAdapter(BaseModelAdapter):
    """The model adapter for togethercomputer/RedPajama-INCITE-7B-Chat"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "redpajama-incite"."""
        lowered = model_path.lower()
        return "redpajama-incite" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM for `model_path`.

        The tokenizer is pinned to `from_pretrained_kwargs["revision"]`
        (default "main"); the model receives every entry of
        `from_pretrained_kwargs` plus `low_cpu_mem_usage=True`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(model_path, revision=rev)
        lm = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "redpajama-incite" conversation template."""
        template_name = "redpajama-incite"
        return get_conv_template(template_name)
1245
+
1246
+
1247
class H2OGPTAdapter(BaseModelAdapter):
    """The model adapter for h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "h2ogpt"."""
        lowered = model_path.lower()
        return "h2ogpt" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "h2ogpt" conversation template."""
        template_name = "h2ogpt"
        return get_conv_template(template_name)
1257
+
1258
+
1259
class RobinAdapter(BaseModelAdapter):
    """The model adapter for LMFlow/Full-Robin-7b-v2"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "robin"."""
        lowered = model_path.lower()
        return "robin" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "Robin" conversation template (capitalised name)."""
        template_name = "Robin"
        return get_conv_template(template_name)
1269
+
1270
+
1271
class SnoozyAdapter(BaseModelAdapter):
    """The model adapter for nomic-ai/gpt4all-13b-snoozy"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains both "gpt4all" and "snoozy"."""
        lowered = model_path.lower()
        if "gpt4all" not in lowered:
            return False
        return "snoozy" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "snoozy" conversation template."""
        template_name = "snoozy"
        return get_conv_template(template_name)
1282
+
1283
+
1284
class WizardLMAdapter(BaseModelAdapter):
    """The model adapter for WizardLM/WizardLM-13B-V1.0"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "wizardlm"."""
        lowered = model_path.lower()
        return "wizardlm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick the template from the size tag found in the path.

        Paths containing "13b", "30b" or "70b" (case-insensitive) get
        "vicuna_v1.1"; every other path gets "one_shot".
        """
        lowered = model_path.lower()
        large_size_tags = ("13b", "30b", "70b")
        if any(tag in lowered for tag in large_size_tags):
            return get_conv_template("vicuna_v1.1")
        # TODO: use the recommended template for 7B
        # (https://huggingface.co/WizardLM/WizardLM-13B-V1.0)
        return get_conv_template("one_shot")
1300
+
1301
+
1302
class ManticoreAdapter(BaseModelAdapter):
    """The model adapter for openaccess-ai-collective/manticore-13b-chat-pyg"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "manticore"."""
        lowered = model_path.lower()
        return "manticore" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "manticore" conversation template."""
        template_name = "manticore"
        return get_conv_template(template_name)
1312
+
1313
+
1314
class GuanacoAdapter(BaseModelAdapter):
    """The model adapter for timdettmers/guanaco-33b-merged"""

    # Passed as `use_fast` to the tokenizer loader in `load_model`.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "guanaco"."""
        lowered = model_path.lower()
        return "guanaco" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM, then align the tokenizer's EOS id.

        Returns:
            A `(model, tokenizer)` tuple whose tokenizer `eos_token_id` has been
            overwritten with the value from the model config.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, use_fast=self.use_fast_tokenizer, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        # Fix a bug in tokenizer config: trust the model config's EOS id.
        tok.eos_token_id = lm.config.eos_token_id
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zero_shot" conversation template."""
        template_name = "zero_shot"
        return get_conv_template(template_name)
1336
+
1337
+
1338
class ChangGPTAdapter(BaseModelAdapter):
    """The model adapter for lcw99/polyglot-ko-12.8b-chang-instruct-chat"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains both "polyglot" and "chang"."""
        lowered = model_path.lower()
        if "polyglot" not in lowered:
            return False
        return "chang" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "polyglot_changgpt" conversation template."""
        template_name = "polyglot_changgpt"
        return get_conv_template(template_name)
1347
+
1348
+
1349
class CamelAdapter(BaseModelAdapter):
    """The model adapter for camel-ai/CAMEL-13B-Combined-Data"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "camel"."""
        lowered = model_path.lower()
        return "camel" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template."""
        template_name = "vicuna_v1.1"
        return get_conv_template(template_name)
1359
+
1360
+
1361
class TuluAdapter(BaseModelAdapter):
    """The model adapter for allenai/tulu-30b"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "tulu"."""
        lowered = model_path.lower()
        return "tulu" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tulu" conversation template."""
        template_name = "tulu"
        return get_conv_template(template_name)
1371
+
1372
+
1373
class FalconAdapter(BaseModelAdapter):
    """The model adapter for tiiuae/falcon-40b"""

    def match(self, model_path: str):
        """Return True for paths containing "falcon" but not "chat" (case-insensitive)."""
        lowered = model_path.lower()
        if "falcon" not in lowered:
            return False
        return "chat" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM and assign a pad token id.

        The model is loaded with `trust_remote_code=True` and
        `low_cpu_mem_usage=True` on top of `from_pretrained_kwargs`.

        Returns:
            A `(model, tokenizer)` tuple; the tokenizer's `pad_token_id` is 9.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        # Strongly suggest using bf16, which is recommended by the author of Falcon
        tok = AutoTokenizer.from_pretrained(model_path, revision=rev)
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        # In Falcon tokenizer config and special config there is not any pad token
        # Setting `pad_token_id` to 9, which corresponds to special token '>>SUFFIX<<'
        tok.pad_token_id = 9
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "falcon" conversation template."""
        template_name = "falcon"
        return get_conv_template(template_name)
1396
+
1397
+
1398
class FalconChatAdapter(BaseModelAdapter):
    """Adapter for paths that contain both "falcon" and "chat"."""

    def match(self, model_path: str):
        """Return True when the lowercased path contains both "falcon" and "chat"."""
        lowered = model_path.lower()
        if "falcon" not in lowered:
            return False
        return "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "falcon-chat" conversation template."""
        template_name = "falcon-chat"
        return get_conv_template(template_name)
1404
+
1405
+
1406
class TigerBotAdapter(BaseModelAdapter):
    """The model adapter for TigerResearch/tigerbot-7b-sft"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "tigerbot"."""
        lowered = model_path.lower()
        return "tigerbot" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM with `trust_remote_code=True`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tigerbot" conversation template."""
        template_name = "tigerbot"
        return get_conv_template(template_name)
1429
+
1430
+
1431
class BaichuanAdapter(BaseModelAdapter):
    """The model adapter for Baichuan models (e.g., baichuan-inc/Baichuan-7B)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "baichuan"."""
        lowered = model_path.lower()
        return "baichuan" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM with `trust_remote_code=True`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Choose a template from the "chat" / "baichuan2" markers in the path.

        Non-chat paths get "zero_shot"; chat paths get "baichuan2-chat" when
        the path also contains "baichuan2", otherwise "baichuan-chat"
        (e.g. Baichuan-13B-Chat).
        """
        lowered = model_path.lower()
        if "chat" not in lowered:
            return get_conv_template("zero_shot")
        if "baichuan2" in lowered:
            return get_conv_template("baichuan2-chat")
        return get_conv_template("baichuan-chat")
1457
+
1458
+
1459
class XGenAdapter(BaseModelAdapter):
    """The model adapter for Salesforce/xgen-7b"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "xgen"."""
        lowered = model_path.lower()
        return "xgen" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM, then the tokenizer, and pin the model's EOS id.

        Returns:
            A `(model, tokenizer)` tuple; `model.config.eos_token_id` is set
            to the hard-coded value 50256.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        lm.config.eos_token_id = 50256
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "xgen" conversation template."""
        template_name = "xgen"
        return get_conv_template(template_name)
1481
+
1482
+
1483
class NousHermesAdapter(BaseModelAdapter):
    """The model adapter for NousResearch/Nous-Hermes-13b"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "nous-hermes"."""
        lowered = model_path.lower()
        return "nous-hermes" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        template_name = "alpaca"
        return get_conv_template(template_name)
1493
+
1494
+
1495
class InternLMChatAdapter(BaseModelAdapter):
    """The model adapter for internlm/internlm-chat-7b"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "internlm"."""
        lowered = model_path.lower()
        return "internlm" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM in eval mode, then the tokenizer.

        When the lowercased path contains "8k", `max_sequence_length` on the
        model config is set to 8192.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        lm = lm.eval()
        is_8k_variant = "8k" in model_path.lower()
        if is_8k_variant:
            lm.config.max_sequence_length = 8192
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "internlm-chat" conversation template."""
        template_name = "internlm-chat"
        return get_conv_template(template_name)
1519
+
1520
+
1521
class StarChatAdapter(BaseModelAdapter):
    """The model adapter for HuggingFaceH4/starchat-beta"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "starchat"."""
        lowered = model_path.lower()
        return "starchat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "starchat" conversation template."""
        template_name = "starchat"
        return get_conv_template(template_name)
1529
+
1530
+
1531
class MistralAdapter(BaseModelAdapter):
    """The model adapter for Mistral AI models"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "mistral" or "mixtral"."""
        lowered = model_path.lower()
        if "mistral" in lowered:
            return True
        return "mixtral" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy EOS/pad ids onto the model config.

        Returns:
            A `(model, tokenizer)` tuple whose model config `eos_token_id` and
            `pad_token_id` equal the tokenizer's.
        """
        lm, tok = super().load_model(model_path, from_pretrained_kwargs)
        cfg = lm.config
        cfg.eos_token_id = tok.eos_token_id
        cfg.pad_token_id = tok.pad_token_id
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "mistral" conversation template."""
        template_name = "mistral"
        return get_conv_template(template_name)
1545
+
1546
+
1547
class Llama2Adapter(BaseModelAdapter):
    """The model adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "llama-2"."""
        lowered = model_path.lower()
        return "llama-2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy EOS/pad ids onto the model config.

        Returns:
            A `(model, tokenizer)` tuple whose model config `eos_token_id` and
            `pad_token_id` equal the tokenizer's.
        """
        lm, tok = super().load_model(model_path, from_pretrained_kwargs)
        cfg = lm.config
        cfg.eos_token_id = tok.eos_token_id
        cfg.pad_token_id = tok.pad_token_id
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-2" conversation template."""
        template_name = "llama-2"
        return get_conv_template(template_name)
1561
+
1562
+
1563
class Llama3Adapter(BaseModelAdapter):
    """The model adapter for Llama-3 (e.g., meta-llama/Meta-Llama-3-8B-Instruct)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "llama-3"."""
        lowered = model_path.lower()
        return "llama-3" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy EOS/pad ids onto the model config.

        Returns:
            A `(model, tokenizer)` tuple whose model config `eos_token_id` and
            `pad_token_id` equal the tokenizer's.
        """
        lm, tok = super().load_model(model_path, from_pretrained_kwargs)
        cfg = lm.config
        cfg.eos_token_id = tok.eos_token_id
        cfg.pad_token_id = tok.pad_token_id
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-3" conversation template."""
        template_name = "llama-3"
        return get_conv_template(template_name)
1577
+
1578
+
1579
class CuteGPTAdapter(BaseModelAdapter):
    """The model adapter for CuteGPT"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "cutegpt"."""
        return "cutegpt" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the Llama tokenizer and causal LM and wire up the "<end>" EOS token.

        The tokenizer is loaded at the same `revision` requested for the model
        (default "main") so the two artifacts cannot come from different
        revisions; the other adapters in this file follow the same convention.

        Returns:
            A `(model, tokenizer)` tuple. The tokenizer's EOS id is the id of
            the "<end>" token, and the model config uses that id for both
            `eos_token_id` and `pad_token_id`.
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = LlamaTokenizer.from_pretrained(model_path, revision=revision)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<end>")
        model.config.eos_token_id = tokenizer.eos_token_id
        # Padding reuses the EOS id; no separate pad id is configured here.
        model.config.pad_token_id = tokenizer.eos_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "cutegpt" conversation template."""
        return get_conv_template("cutegpt")
1597
+
1598
+
1599
class OpenOrcaAdapter(BaseModelAdapter):
    """Model adapter for Open-Orca models which may use different prompt templates

    - (e.g. Open-Orca/OpenOrcaxOpenChat-Preview2-13B, Open-Orca/Mistral-7B-OpenOrca)
    - `OpenOrcaxOpenChat-Preview2-13B` uses their "OpenChat Llama2 V1" prompt template.
        - [Open-Orca/OpenOrcaxOpenChat-Preview2-13B #Prompt Template](https://huggingface.co/Open-Orca/OpenOrcaxOpenChat-Preview2-13B#prompt-template)
    - `Mistral-7B-OpenOrca` uses the [OpenAI's Chat Markup Language (ChatML)](https://github.com/openai/openai-python/blob/main/chatml.md)
      format, with <|im_start|> and <|im_end|> tokens added to support this.
        - [Open-Orca/Mistral-7B-OpenOrca #Prompt Template](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template)
    """

    # Passed as `use_fast` to the tokenizer loader in `load_model`.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains either Open-Orca marker."""
        lowered = model_path.lower()
        markers = ("mistral-7b-openorca", "openorca")
        return any(marker in lowered for marker in markers)

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and the causal LM (switched to eval mode).

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, use_fast=self.use_fast_tokenizer, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        lm = lm.eval()
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return "mistral-7b-openorca" for that model, otherwise "open-orca"."""
        is_mistral_variant = "mistral-7b-openorca" in model_path.lower()
        if not is_mistral_variant:
            return get_conv_template("open-orca")
        return get_conv_template("mistral-7b-openorca")
1633
+
1634
+
1635
class DolphinAdapter(OpenOrcaAdapter):
    """Model adapter for ehartford/dolphin-2.2.1-mistral-7b"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains both "dolphin" and "mistral"."""
        lowered = model_path.lower()
        if "dolphin" not in lowered:
            return False
        return "mistral" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "dolphin-2.2.1-mistral-7b" conversation template."""
        template_name = "dolphin-2.2.1-mistral-7b"
        return get_conv_template(template_name)
1643
+
1644
+
1645
class Hermes2Adapter(BaseModelAdapter):
    """Model adapter for teknium/OpenHermes-2.5-Mistral-7B and teknium/OpenHermes-2-Mistral-7B models"""

    # Passed as `use_fast` to the tokenizer loader in `load_model`.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains a supported OpenHermes name."""
        lowered = model_path.lower()
        for known_name in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b"]:
            if known_name in lowered:
                return True
        return False

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and the causal LM (switched to eval mode).

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, use_fast=self.use_fast_tokenizer, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        lm = lm.eval()
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "OpenHermes-2.5-Mistral-7B" conversation template."""
        template_name = "OpenHermes-2.5-Mistral-7B"
        return get_conv_template(template_name)
1670
+
1671
+
1672
class NousHermes2MixtralAdapter(BaseModelAdapter):
    """Model adapter for NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO model"""

    def match(self, model_path: str):
        """Return True when the lowercased path names the DPO or SFT Mixtral variant."""
        lowered = model_path.lower()
        known_names = [
            "nous-hermes-2-mixtral-8x7b-dpo",
            "nous-hermes-2-mixtral-8x7b-sft",
        ]
        for known_name in known_names:
            if known_name in lowered:
                return True
        return False

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "Nous-Hermes-2-Mixtral-8x7B-DPO" conversation template."""
        template_name = "Nous-Hermes-2-Mixtral-8x7B-DPO"
        return get_conv_template(template_name)
1686
+
1687
+
1688
class WizardCoderAdapter(BaseModelAdapter):
    """The model adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)"""

    # Presumably read by the inherited loader to pick the slow tokenizer --
    # confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "wizardcoder"."""
        lowered = model_path.lower()
        return "wizardcoder" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        # Same as Alpaca, see :
        # https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/inference_wizardcoder.py#L60
        template_name = "alpaca"
        return get_conv_template(template_name)
1700
+
1701
+
1702
class QwenChatAdapter(BaseModelAdapter):
    """The model adapter for Qwen/Qwen-7B-Chat

    To run this model, you need to ensure additional flash attention installation:
    ``` bash
    git clone https://github.com/Dao-AILab/flash-attention
    cd flash-attention && pip install .
    pip install csrc/layer_norm
    pip install csrc/rotary
    ```

    Since from 2.0, the following change happened
    - `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
    - `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
    - `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`
    You may need to revise the code in: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L69
    to from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
    """

    def match(self, model_path: str):
        """Return True when the lowercased path contains "qwen"."""
        return "qwen" in model_path.lower()

    def float_set(self, config, option):
        """Enable exactly one of the `bf16` / `fp16` / `fp32` flags on `config`.

        All three flags are first cleared. If `option` is not one of the three
        names, a message is printed and every flag stays False.
        """
        precision_flags = ("bf16", "fp16", "fp32")
        for flag in precision_flags:
            setattr(config, flag, flag == option)
        if option not in precision_flags:
            print("Invalid option. Please choose one from 'bf16', 'fp16' and 'fp32'.")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the Qwen model and tokenizer and synchronise special-token ids.

        The model config and generation config are fetched at the same
        `revision` as the weights and tokenizer (default "main"), so the token
        ids copied below describe the checkpoint that was actually loaded.

        Returns:
            A `(model, tokenizer)` tuple. The model is in eval mode and both
            objects share the same EOS, BOS and pad token ids.
        """
        from transformers.generation import GenerationConfig

        revision = from_pretrained_kwargs.get("revision", "main")
        config = AutoConfig.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=revision,
        )
        # NOTE: if you use the old version of model file, please remove the comments below
        # config.use_flash_attn = False
        self.float_set(config, "fp16")
        generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            config=config,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        ).eval()
        # Raise the advertised context length when dynamic NTK is enabled.
        if getattr(model.config, "use_dynamic_ntk", False):
            model.config.max_sequence_length = 16384
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=revision
        )
        # EOS/BOS come from the model config, the pad id from the generation
        # config; the tokenizer is updated first and then mirrored back onto
        # the loaded model's config.
        tokenizer.eos_token_id = config.eos_token_id
        tokenizer.bos_token_id = config.bos_token_id
        tokenizer.pad_token_id = generation_config.pad_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.bos_token_id = tokenizer.bos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id

        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "qwen-7b-chat" conversation template."""
        return get_conv_template("qwen-7b-chat")
1774
+
1775
+
1776
class SmaugChatAdapter(BaseModelAdapter):
    """The model adapter for abacusai/Smaug-2-72B."""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "smaug"."""
        lowered = model_path.lower()
        return "smaug" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "qwen-7b-chat" conversation template."""
        template_name = "qwen-7b-chat"
        return get_conv_template(template_name)
1784
+
1785
+
1786
class BGEAdapter(BaseModelAdapter):
    """The model adapter for BGE (e.g., BAAI/bge-large-en-v1.5)"""

    # Not referenced inside this class; presumably read by shared loader code
    # -- confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "bge"."""
        lowered = model_path.lower()
        return "bge" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model with `AutoModel`, then its tokenizer.

        When the model config exposes `max_position_embeddings` and the
        tokenizer exposes `model_max_length`, the smaller of the two is stored
        as `model.config.max_sequence_length`. The model is flagged with
        `use_cls_pooling = True` and put in eval mode.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        encoder = AutoModel.from_pretrained(model_path, **from_pretrained_kwargs)
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        has_position_limit = hasattr(encoder.config, "max_position_embeddings")
        if has_position_limit and hasattr(tok, "model_max_length"):
            encoder.config.max_sequence_length = min(
                encoder.config.max_position_embeddings, tok.model_max_length
            )
        encoder.use_cls_pooling = True
        encoder.eval()
        return encoder, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "one_shot" conversation template."""
        template_name = "one_shot"
        return get_conv_template(template_name)
1815
+
1816
+
1817
class E5Adapter(BaseModelAdapter):
    """The model adapter for E5 (e.g., intfloat/e5-large-v2)"""

    # Not referenced inside this class; presumably read by shared loader code
    # -- confirm against BaseModelAdapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "e5-"."""
        lowered = model_path.lower()
        return "e5-" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model with `AutoModel`, then its tokenizer.

        When the model config exposes `max_position_embeddings` and the
        tokenizer exposes `model_max_length`, the smaller of the two is stored
        as `model.config.max_sequence_length`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        encoder = AutoModel.from_pretrained(model_path, **from_pretrained_kwargs)
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        has_position_limit = hasattr(encoder.config, "max_position_embeddings")
        if has_position_limit and hasattr(tok, "model_max_length"):
            encoder.config.max_sequence_length = min(
                encoder.config.max_position_embeddings, tok.model_max_length
            )
        return encoder, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "one_shot" conversation template."""
        template_name = "one_shot"
        return get_conv_template(template_name)
1844
+
1845
+
1846
class AquilaChatAdapter(BaseModelAdapter):
    """The model adapter for BAAI/Aquila

    Now supports:
    - BAAI/AquilaChat-7B
    - BAAI/AquilaChat2-7B
    - BAAI/AquilaChat2-34B
    """

    def match(self, model_path: str):
        """Return True when the lowercased path contains "aquila"."""
        lowered = model_path.lower()
        return "aquila" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM in eval mode, then the tokenizer.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        lm = lm.eval()
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Select the Aquila template from markers in the lowercased path.

        Paths without "aquilachat2" get "aquila-chat". AquilaChat2 paths get
        "aquila" when they contain "16k", else "aquila-legacy" when they
        contain "34b", else "aquila-v1".
        """
        lowered = model_path.lower()
        # See: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L347
        if "aquilachat2" not in lowered:
            return get_conv_template("aquila-chat")
        if "16k" in lowered:
            return get_conv_template("aquila")
        if "34b" in lowered:
            return get_conv_template("aquila-legacy")
        return get_conv_template("aquila-v1")
1884
+
1885
+
1886
class Lamma2ChineseAdapter(BaseModelAdapter):
    """The model adapter for FlagAlpha/LLama2-Chinese sft"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "llama2-chinese"."""
        lowered = model_path.lower()
        return "llama2-chinese" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM with `trust_remote_code=True`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama2-chinese" conversation template."""
        template_name = "llama2-chinese"
        return get_conv_template(template_name)
1909
+
1910
+
1911
class Lamma2ChineseAlpacaAdapter(BaseModelAdapter):
    """The model adapter for ymcui/Chinese-LLaMA-Alpaca sft"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "chinese-alpaca"."""
        lowered = model_path.lower()
        return "chinese-alpaca" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and causal LM with `trust_remote_code=True`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=rev
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chinese-alpaca2" conversation template."""
        template_name = "chinese-alpaca2"
        return get_conv_template(template_name)
1934
+
1935
+
1936
class VigogneAdapter(BaseModelAdapter):
    """The model adapter for vigogne (e.g., bofenghuang/vigogne-2-7b-chat)"""

    # Passed as `use_fast` to the tokenizer loader in `load_model`.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the path contains "vigogne" or "vigostral", ignoring case."""
        found = re.search(r"vigogne|vigostral", model_path, re.I)
        return found is not None

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and the causal LM (switched to eval mode).

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            trust_remote_code=True,
            revision=rev,
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        lm = lm.eval()
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Select the Vigogne template from markers in the lowercased path.

        Non-chat paths get "vigogne_instruct"; chat paths get
        "vigogne_chat_v3" when they contain "vigostral", else
        "vigogne_chat_v2".
        """
        lowered = model_path.lower()
        if "chat" not in lowered:
            return get_conv_template("vigogne_instruct")
        if "vigostral" in lowered:
            return get_conv_template("vigogne_chat_v3")
        return get_conv_template("vigogne_chat_v2")
1966
+
1967
+
1968
class OpenLLaMaOpenInstructAdapter(BaseModelAdapter):
    """The model adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)"""

    # Passed as `use_fast` to the tokenizer loader in `load_model`.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lowercased path contains "open-llama" and "open-instruct"."""
        lowered = model_path.lower()
        if "open-llama" not in lowered:
            return False
        return "open-instruct" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer and the causal LM (switched to eval mode).

        Returns:
            A `(model, tokenizer)` tuple.
        """
        rev = from_pretrained_kwargs.get("revision", "main")
        tok = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            trust_remote_code=True,
            revision=rev,
        )
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        lm = lm.eval()
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        template_name = "alpaca"
        return get_conv_template(template_name)
1996
+
1997
+
1998
class CodeLlamaAdapter(BaseModelAdapter):
    """The model adapter for CodeLlama (e.g., codellama/CodeLlama-34b-hf)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "codellama"."""
        lowered = model_path.lower()
        return "codellama" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy EOS/pad ids onto the model config.

        Returns:
            A `(model, tokenizer)` tuple whose model config `eos_token_id` and
            `pad_token_id` equal the tokenizer's.
        """
        lm, tok = super().load_model(model_path, from_pretrained_kwargs)
        cfg = lm.config
        cfg.eos_token_id = tok.eos_token_id
        cfg.pad_token_id = tok.pad_token_id
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-2" conversation template."""
        template_name = "llama-2"
        return get_conv_template(template_name)
2012
+
2013
+
2014
class StableVicunaAdapter(BaseModelAdapter):
    """The model adapter for StableVicuna"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "stable-vicuna"."""
        lowered = model_path.lower()
        return "stable-vicuna" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy EOS/pad ids onto the model config.

        Returns:
            A `(model, tokenizer)` tuple whose model config `eos_token_id` and
            `pad_token_id` equal the tokenizer's.
        """
        lm, tok = super().load_model(model_path, from_pretrained_kwargs)
        cfg = lm.config
        cfg.eos_token_id = tok.eos_token_id
        cfg.pad_token_id = tok.pad_token_id
        return lm, tok

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "stable-vicuna" conversation template."""
        template_name = "stable-vicuna"
        return get_conv_template(template_name)
2028
+
2029
+
2030
class PhindCodeLlamaAdapter(CodeLlamaAdapter):
    """The model adapter for Phind-CodeLlama (e.g., Phind/Phind-CodeLlama-34B-v2)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "phind-codellama-"."""
        lowered = model_path.lower()
        return "phind-codellama-" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "phind" conversation template."""
        template_name = "phind"
        return get_conv_template(template_name)
2038
+
2039
+
2040
class Llama2ChangAdapter(Llama2Adapter):
    """The model adapter for Llama2-ko-chang (e.g., lcw99/llama2-ko-chang-instruct-chat)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "llama2-ko-chang"."""
        lowered = model_path.lower()
        return "llama2-ko-chang" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "polyglot_changgpt" conversation template."""
        template_name = "polyglot_changgpt"
        return get_conv_template(template_name)
2048
+
2049
+
2050
class ZephyrAdapter(BaseModelAdapter):
    """The model adapter for Zephyr (e.g. HuggingFaceH4/zephyr-7b-alpha)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "zephyr"."""
        lowered = model_path.lower()
        return "zephyr" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zephyr" conversation template."""
        template_name = "zephyr"
        return get_conv_template(template_name)
2058
+
2059
+
2060
class NotusAdapter(BaseModelAdapter):
    """The model adapter for Notus (e.g. argilla/notus-7b-v1)"""

    def match(self, model_path: str):
        """Return True when the lowercased path contains "notus"."""
        lowered = model_path.lower()
        return "notus" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zephyr" conversation template, which Notus reuses."""
        template_name = "zephyr"
        return get_conv_template(template_name)
2068
+
2069
+
2070
class CatPPTAdapter(BaseModelAdapter):
    """Adapter for CatPPT checkpoints (e.g. rishiraj/CatPPT)."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "catppt"."""
        lowered = model_path.lower()
        return "catppt" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "catppt" conversation template; `model_path` is not consulted."""
        template_name = "catppt"
        return get_conv_template(template_name)
2078
+
2079
+
2080
class TinyLlamaAdapter(BaseModelAdapter):
    """Adapter for TinyLlama checkpoints (e.g. TinyLlama/TinyLlama-1.1B-Chat-v1.0)."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "tinyllama"."""
        lowered = model_path.lower()
        return "tinyllama" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "TinyLlama" conversation template; `model_path` is not consulted."""
        template_name = "TinyLlama"
        return get_conv_template(template_name)
2088
+
2089
+
2090
class XwinLMAdapter(BaseModelAdapter):
    """Adapter for the Xwin-LM V0.1 and V0.2 model series (e.g., Xwin-LM/Xwin-LM-70B-V0.1)."""

    # use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "xwin-lm"."""
        lowered = model_path.lower()
        return "xwin-lm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template, which Xwin-LM reuses."""
        template_name = "vicuna_v1.1"
        return get_conv_template(template_name)
2100
+
2101
+
2102
class LemurAdapter(BaseModelAdapter):
    """Adapter for OpenLemur/lemur-70b-chat-v1."""

    # Class-level override of the tokenizer flag defined on the base adapter.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "lemur-70b-chat"."""
        lowered = model_path.lower()
        return "lemur-70b-chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "lemur-70b-chat" template; `model_path` is not consulted."""
        template_name = "lemur-70b-chat"
        return get_conv_template(template_name)
2112
+
2113
+
2114
class PygmalionAdapter(BaseModelAdapter):
    """Adapter for the Pygmalion/Metharme model series (e.g., PygmalionAI/mythalion-13b)."""

    # use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the path mentions pygmalion, mythalion or metharme."""
        found = re.search(
            r"pygmalion|mythalion|metharme", model_path.lower(), re.I
        )
        return bool(found)

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "metharme" conversation template; `model_path` is not consulted."""
        template_name = "metharme"
        return get_conv_template(template_name)
2126
+
2127
+
2128
class XdanAdapter(BaseModelAdapter):
    """Adapter for xDAN-AI checkpoints (e.g. xDAN-AI/xDAN-L1-Chat-RL-v1)."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains both "xdan" and "v1"."""
        lowered = model_path.lower()
        return "xdan" in lowered and "v1" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "xdan-v1" conversation template; `model_path` is not consulted."""
        template_name = "xdan-v1"
        return get_conv_template(template_name)
2136
+
2137
+
2138
class MicrosoftOrcaAdapter(BaseModelAdapter):
    """Adapter for the Microsoft/Orca-2 model series (e.g. Microsoft/Orca-2-7b, Microsoft/Orca-2-13b)."""

    # Flag needed since tokenizers>=0.13.3 is required for a normal
    # functioning of this module.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "orca-2"."""
        lowered = model_path.lower()
        return "orca-2" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "orca-2" conversation template; `model_path` is not consulted."""
        template_name = "orca-2"
        return get_conv_template(template_name)
2148
+
2149
+
2150
class YiAdapter(BaseModelAdapter):
    """Adapter for Yi chat models."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains both "yi-" and "chat"."""
        lowered = model_path.lower()
        return "yi-" in lowered and "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "Yi-34b-chat" conversation template; `model_path` is not consulted."""
        template_name = "Yi-34b-chat"
        return get_conv_template(template_name)
2158
+
2159
+
2160
class DeepseekCoderAdapter(BaseModelAdapter):
    """Adapter for deepseek-ai's coder models."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "deepseek-coder"."""
        lowered = model_path.lower()
        return "deepseek-coder" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "deepseek-coder" template; `model_path` is not consulted."""
        template_name = "deepseek-coder"
        return get_conv_template(template_name)
2168
+
2169
+
2170
class DeepseekChatAdapter(BaseModelAdapter):
    """Adapter for deepseek-ai's chat models."""

    # Note: this model requires tokenizer version >= 0.13.3 because the
    # tokenizer class is LlamaTokenizerFast.

    def match(self, model_path: str):
        """Return True when the lower-cased path contains both "deepseek-llm" and "chat"."""
        lowered = model_path.lower()
        return "deepseek-llm" in lowered and "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "deepseek-chat" template; `model_path` is not consulted."""
        template_name = "deepseek-chat"
        return get_conv_template(template_name)
2180
+
2181
+
2182
class Yuan2Adapter(BaseModelAdapter):
    """Adapter for Yuan2.0 checkpoints."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "yuan2"."""
        lowered = model_path.lower()
        return "yuan2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a LlamaTokenizer with Yuan2's extra special tokens and the causal LM.

        Args:
            model_path: Checkpoint path or hub id passed to both loaders.
            from_pretrained_kwargs: Forwarded unchanged to the model loader;
                its "revision" entry (default "main") is also used for the
                tokenizer.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        tokenizer_revision = from_pretrained_kwargs.get("revision", "main")
        # from_pretrained_kwargs["torch_dtype"] = torch.bfloat16
        tokenizer_options = dict(
            add_eos_token=False,
            add_bos_token=False,
            eos_token="<eod>",
            eod_token="<eod>",
            sep_token="<sep>",
            revision=tokenizer_revision,
        )
        tokenizer = LlamaTokenizer.from_pretrained(model_path, **tokenizer_options)

        # Extra markers registered as special tokens on top of the base vocabulary.
        extra_special_tokens = [
            "<sep>",
            "<pad>",
            "<mask>",
            "<predict>",
            "<FIM_SUFFIX>",
            "<FIM_PREFIX>",
            "<FIM_MIDDLE>",
            "<commit_before>",
            "<commit_msg>",
            "<commit_after>",
            "<jupyter_start>",
            "<jupyter_text>",
            "<jupyter_code>",
            "<jupyter_output>",
            "<empty_output>",
        ]
        tokenizer.add_tokens(extra_special_tokens, special_tokens=True)

        # device_map='auto' is deliberately not passed here.
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "yuan2" conversation template; `model_path` is not consulted."""
        template_name = "yuan2"
        return get_conv_template(template_name)
2231
+
2232
+
2233
class MetaMathAdapter(BaseModelAdapter):
    """Adapter for MetaMath models."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "metamath"."""
        lowered = model_path.lower()
        return "metamath" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "metamath" conversation template; `model_path` is not consulted."""
        template_name = "metamath"
        return get_conv_template(template_name)
2241
+
2242
+
2243
class BagelAdapter(BaseModelAdapter):
    """Adapter for jondurbin/bagel-* models."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "bagel"."""
        lowered = model_path.lower()
        return "bagel" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "airoboros_v3" conversation template, which bagel reuses."""
        template_name = "airoboros_v3"
        return get_conv_template(template_name)
2251
+
2252
+
2253
class SolarAdapter(BaseModelAdapter):
    """Adapter for upstage/SOLAR-10.7B-Instruct-v1.0."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains both "solar-" and "instruct"."""
        lowered = model_path.lower()
        return "solar-" in lowered and "instruct" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "solar" conversation template; `model_path` is not consulted."""
        template_name = "solar"
        return get_conv_template(template_name)
2261
+
2262
+
2263
class SteerLMAdapter(BaseModelAdapter):
    """Adapter for nvidia/Llama2-70B-SteerLM-Chat."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "steerlm-chat"."""
        lowered = model_path.lower()
        return "steerlm-chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "steerlm" conversation template; `model_path` is not consulted."""
        template_name = "steerlm"
        return get_conv_template(template_name)
2271
+
2272
+
2273
class GemmaAdapter(BaseModelAdapter):
    """Adapter for google/gemma."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "gemma"."""
        lowered = model_path.lower()
        return "gemma" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "gemma" conversation template; `model_path` is not consulted."""
        template_name = "gemma"
        return get_conv_template(template_name)
2281
+
2282
+
2283
class LlavaAdapter(BaseModelAdapter):
    """Adapter for the liuhaotian/llava-v1.5 model series."""

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Placeholder loader: does nothing and returns None."""
        # TODO(chris): Implement huggingface-compatible load_model
        return None

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "llava"."""
        lowered = model_path.lower()
        return "llava" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return "llava-chatml" for paths containing "34b", else "vicuna_v1.1"."""
        lowered = model_path.lower()
        template_name = "llava-chatml" if "34b" in lowered else "vicuna_v1.1"
        return get_conv_template(template_name)
2299
+
2300
+
2301
class YuanAdapter(BaseModelAdapter):
    """Adapter for Yuan checkpoints."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "yuan"."""
        lowered = model_path.lower()
        return "yuan" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then register Yuan's extra special tokens.

        Returns:
            A `(model, tokenizer)` tuple; the tokenizer has the additional
            special tokens added in place.
        """
        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)

        # Extra markers registered as special tokens on top of the base vocabulary.
        extra_special_tokens = [
            "<sep>",
            "<pad>",
            "<mask>",
            "<predict>",
            "<FIM_SUFFIX>",
            "<FIM_PREFIX>",
            "<FIM_MIDDLE>",
            "<commit_before>",
            "<commit_msg>",
            "<commit_after>",
            "<jupyter_start>",
            "<jupyter_text>",
            "<jupyter_code>",
            "<jupyter_output>",
            "<empty_output>",
        ]
        tokenizer.add_tokens(extra_special_tokens, special_tokens=True)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "yuan" conversation template; `model_path` is not consulted."""
        template_name = "yuan"
        return get_conv_template(template_name)
2333
+
2334
+
2335
class OlmoAdapter(BaseModelAdapter):
    """Adapter for allenai/OLMo-7B-Instruct."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "olmo"."""
        lowered = model_path.lower()
        return "olmo" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "api_based_default" template; `model_path` is not consulted."""
        template_name = "api_based_default"
        return get_conv_template(template_name)
2343
+
2344
+
2345
class YandexGPTAdapter(BaseModelAdapter):
    """Adapter for YandexGPT."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "yandexgpt"."""
        lowered = model_path.lower()
        return "yandexgpt" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "yandexgpt" conversation template; `model_path` is not consulted."""
        template_name = "yandexgpt"
        return get_conv_template(template_name)
2353
+
2354
+
2355
class CllmAdapter(BaseModelAdapter):
    """Adapter for CLLM (consistency LLM) checkpoints."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "consistency-llm"."""
        lowered = model_path.lower()
        return "consistency-llm" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the CLLM config, tokenizer and bfloat16 model onto CUDA.

        NOTE(review): `from_pretrained_kwargs` is accepted but never read, so
        caller-supplied options (revision, dtype, device map, ...) have no
        effect here -- confirm this is intended.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        model_config = AutoConfig.from_pretrained(model_path)

        tokenizer = AutoTokenizer.from_pretrained(
            model_path, model_max_length=2048, padding_side="right"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            config=model_config,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            device_map="cuda",
        )

        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "cllm" conversation template; `model_path` is not consulted."""
        template_name = "cllm"
        return get_conv_template(template_name)
2384
+
2385
+
2386
class CohereAdapter(BaseModelAdapter):
    """Adapter for Cohere models; local loading is not supported."""

    def match(self, model_path: str):
        """Return True only for the exact model name "command-r"."""
        supported_names = ["command-r"]
        return model_path in supported_names

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise NotImplementedError: this adapter cannot load weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "api_based_default" template; `model_path` is not consulted."""
        template_name = "api_based_default"
        return get_conv_template(template_name)
2397
+
2398
+
2399
class DBRXAdapter(BaseModelAdapter):
    """The model adapter for DBRX; local loading is not supported."""

    def match(self, model_path: str):
        """Return True only for the exact model name "dbrx-instruct"."""
        return model_path in ["dbrx-instruct"]

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Always raise NotImplementedError: this adapter cannot load weights."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "api_based_default" template; `model_path` is not consulted."""
        return get_conv_template("api_based_default")
2410
+
2411
+
2412
class RekaAdapter(BaseModelAdapter):
    """Adapter for Reka models."""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "reka"."""
        lowered = model_path.lower()
        return "reka" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "api_based_default" template; `model_path` is not consulted."""
        template_name = "api_based_default"
        return get_conv_template(template_name)
2420
+
2421
+
2422
# Note: the registration order matters.
# The one registered earlier has a higher matching priority.
register_model_adapter(PeftModelAdapter)
register_model_adapter(StableVicunaAdapter)
register_model_adapter(VicunaAdapter)
register_model_adapter(AiroborosAdapter)
register_model_adapter(LongChatAdapter)
register_model_adapter(GoogleT5Adapter)
register_model_adapter(KoalaAdapter)
register_model_adapter(AlpacaAdapter)
register_model_adapter(ChatGLMAdapter)
register_model_adapter(CodeGeexAdapter)
register_model_adapter(DollyV2Adapter)
register_model_adapter(OasstPythiaAdapter)
register_model_adapter(OasstLLaMAAdapter)
register_model_adapter(OpenChat35Adapter)
register_model_adapter(TenyxChatAdapter)
register_model_adapter(StableLMAdapter)
register_model_adapter(BaizeAdapter)
register_model_adapter(RwkvAdapter)
register_model_adapter(OpenBuddyAdapter)
register_model_adapter(PhoenixAdapter)
register_model_adapter(BardAdapter)
register_model_adapter(PaLM2Adapter)
register_model_adapter(GeminiAdapter)
register_model_adapter(GeminiDevAdapter)
register_model_adapter(GemmaAdapter)
register_model_adapter(ChatGPTAdapter)
register_model_adapter(AzureOpenAIAdapter)
register_model_adapter(ClaudeAdapter)
register_model_adapter(MPTAdapter)
register_model_adapter(BiLLaAdapter)
register_model_adapter(RedPajamaINCITEAdapter)
register_model_adapter(H2OGPTAdapter)
register_model_adapter(RobinAdapter)
register_model_adapter(SnoozyAdapter)
register_model_adapter(WizardLMAdapter)
register_model_adapter(ManticoreAdapter)
register_model_adapter(GuanacoAdapter)
register_model_adapter(CamelAdapter)
register_model_adapter(ChangGPTAdapter)
register_model_adapter(TuluAdapter)
register_model_adapter(FalconChatAdapter)
register_model_adapter(FalconAdapter)
register_model_adapter(TigerBotAdapter)
register_model_adapter(BaichuanAdapter)
register_model_adapter(XGenAdapter)
register_model_adapter(PythiaAdapter)
register_model_adapter(InternLMChatAdapter)
register_model_adapter(StarChatAdapter)
register_model_adapter(Llama2Adapter)
register_model_adapter(Llama3Adapter)
register_model_adapter(CuteGPTAdapter)
register_model_adapter(OpenOrcaAdapter)
register_model_adapter(DolphinAdapter)
register_model_adapter(Hermes2Adapter)
register_model_adapter(NousHermes2MixtralAdapter)
register_model_adapter(NousHermesAdapter)
register_model_adapter(MistralAdapter)
register_model_adapter(WizardCoderAdapter)
register_model_adapter(QwenChatAdapter)
register_model_adapter(AquilaChatAdapter)
register_model_adapter(BGEAdapter)
register_model_adapter(E5Adapter)
register_model_adapter(Lamma2ChineseAdapter)
register_model_adapter(Lamma2ChineseAlpacaAdapter)
register_model_adapter(VigogneAdapter)
register_model_adapter(OpenLLaMaOpenInstructAdapter)
register_model_adapter(ReaLMAdapter)
register_model_adapter(PhindCodeLlamaAdapter)
register_model_adapter(CodeLlamaAdapter)
register_model_adapter(Llama2ChangAdapter)
register_model_adapter(ZephyrAdapter)
register_model_adapter(NotusAdapter)
register_model_adapter(CatPPTAdapter)
register_model_adapter(TinyLlamaAdapter)
register_model_adapter(XwinLMAdapter)
register_model_adapter(LemurAdapter)
register_model_adapter(PygmalionAdapter)
register_model_adapter(MicrosoftOrcaAdapter)
register_model_adapter(XdanAdapter)
register_model_adapter(YiAdapter)
register_model_adapter(PplxAIAdapter)
register_model_adapter(DeepseekCoderAdapter)
register_model_adapter(DeepseekChatAdapter)
register_model_adapter(Yuan2Adapter)
register_model_adapter(MetaMathAdapter)
register_model_adapter(BagelAdapter)
register_model_adapter(SolarAdapter)
register_model_adapter(SteerLMAdapter)
register_model_adapter(LlavaAdapter)
register_model_adapter(YuanAdapter)
register_model_adapter(OlmoAdapter)
register_model_adapter(CohereAdapter)
register_model_adapter(DBRXAdapter)
# GemmaAdapter is already registered above; a second registration here could
# never win a match because the earlier entry has priority, so it is omitted.
register_model_adapter(YandexGPTAdapter)
register_model_adapter(CllmAdapter)
register_model_adapter(RekaAdapter)
register_model_adapter(SmaugChatAdapter)

# After all adapters, try the default base adapter.
register_model_adapter(BaseModelAdapter)
src/model/model_chatglm.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference code for ChatGLM.
3
+ Adapted from https://huggingface.co/THUDM/chatglm-6b/blob/main/modeling_chatglm.py.
4
+ """
5
+ import re
6
+
7
+ import torch
8
+ from transformers.generation.logits_process import LogitsProcessor
9
+
10
+
11
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Logits processor that overwrites a score tensor containing NaN or inf.

    When any entry is NaN or infinite, the tensor is zeroed in place and
    index 5 along the last axis is set to 5e4, so that index dominates
    selection. Tensors without such entries pass through untouched.
    """

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        """Return `scores`, repaired in place if it holds NaN/inf values."""
        has_nan = torch.isnan(scores).any()
        corrupted = has_nan or torch.isinf(scores).any()
        if not corrupted:
            return scores
        scores.zero_()
        scores[..., 5] = 5e4
        return scores


# Shared module-level instance used by the ChatGLM generation code.
invalid_score_processor = InvalidScoreLogitsProcessor()
22
+
23
+
24
def process_response(response):
    """Clean up a decoded ChatGLM response.

    Strips surrounding whitespace, substitutes the "[[训练时间]]" placeholder
    with "2023年", and converts the half-width punctuation marks , ! : ; ?
    to their full-width forms whenever they directly follow or precede a CJK
    ideograph (U+4E00-U+9FFF). Punctuation not adjacent to such a character
    is left alone.

    Args:
        response: The decoded model output.

    Returns:
        The cleaned-up string.
    """
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    # (regex for the half-width mark, full-width replacement). The targets are
    # written as escapes so they cannot be silently folded back to ASCII by
    # tools that normalise Unicode; the "?" pattern is a raw string because
    # "\?" is not a valid escape in a normal string literal.
    punkts = [
        [",", "\uff0c"],
        ["!", "\uff01"],
        [":", "\uff1a"],
        [";", "\uff1b"],
        [r"\?", "\uff1f"],
    ]
    for half_width, full_width in punkts:
        # Mark immediately after a CJK character.
        response = re.sub(
            r"([\u4e00-\u9fff])%s" % half_width, r"\1%s" % full_width, response
        )
        # Mark immediately before a CJK character.
        response = re.sub(
            r"%s([\u4e00-\u9fff])" % half_width, r"%s\1" % full_width, response
        )
    return response
38
+
39
+
40
def recover_message_list(prompt):
    """Split a ChatGLM3-style prompt back into a list of role/content messages.

    The prompt is scanned for the role tokens "<|system|>", "<|user|>" and
    "<|assistant|>". Each token that is followed by another role token yields
    one message whose content is the text between them, minus the single
    character right after the opening token. Text after the final role token
    is not turned into a message.

    Args:
        prompt: The flattened conversation string.

    Returns:
        A list of `{"role": ..., "content": ...}` dicts in prompt order.
    """
    role_by_token = {
        "<|system|>": "system",
        "<|user|>": "user",
        "<|assistant|>": "assistant",
    }
    token_regex = "|".join(
        re.escape(token) for token in ["<|system|>", "<|user|>", "<|assistant|>"]
    )
    markers = list(re.finditer(token_regex, prompt))

    recovered = []
    # Pair each role token with the next one; the span between them is a turn.
    for opening, closing in zip(markers, markers[1:]):
        recovered.append(
            {
                "role": role_by_token[opening.group()],
                "content": prompt[opening.end() + 1 : closing.start()],
            }
        )
    return recovered
63
+
64
+
65
@torch.inference_mode()
def generate_stream_chatglm(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream a ChatGLM completion as a sequence of result dicts.

    Args:
        model: ChatGLM model exposing `stream_generate` and `device`.
        tokenizer: Matching tokenizer; for ChatGLM3 it must also provide
            `build_chat_input`.
        params: Request dict. Reads "prompt" (required) and the optional
            "temperature", "repetition_penalty", "top_p", "max_new_tokens"
            (default 256) and "echo" (default True).
        device: Unused by this implementation (tensors go to `model.device`).
        context_len: Unused by this implementation.
        stream_interval: Unused by this implementation.
        judge_sent_end: Unused by this implementation.

    Yields:
        Dicts with "text", "usage" and "finish_reason". Every intermediate
        chunk has `finish_reason=None`; a final chunk repeating the last text
        is emitted with `finish_reason="stop"`.
    """
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    echo = params.get("echo", True)

    # Look through a PEFT wrapper to identify the underlying model class.
    model_type = str(type(model)).lower()
    if "peft" in model_type:
        model_type = str(type(model.base_model.model)).lower()

    if "chatglm3" in model_type:
        # ChatGLM3 wants structured history, so rebuild it from the flat prompt.
        message_list = recover_message_list(prompt)
        inputs = tokenizer.build_chat_input(
            query=message_list[-1]["content"], history=message_list[:-1], role="user"
        ).to(model.device)
    else:
        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    input_echo_len = len(inputs["input_ids"][0])

    sampling = temperature > 1e-5
    gen_kwargs = {
        "max_length": max_new_tokens + input_echo_len,
        "do_sample": sampling,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "logits_processor": [invalid_score_processor],
    }
    if sampling:
        # Temperature is only passed when sampling is enabled.
        gen_kwargs["temperature"] = temperature

    total_len = 0
    # Pre-initialise so the final chunk is well-defined even if the model
    # yields nothing (previously this raised UnboundLocalError).
    response = ""
    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
        total_ids = total_ids.tolist()[0]
        total_len = len(total_ids)
        if echo:
            output_ids = total_ids
        else:
            output_ids = total_ids[input_echo_len:]
        response = tokenizer.decode(output_ids)
        response = process_response(response)

        yield {
            "text": response,
            "usage": {
                "prompt_tokens": input_echo_len,
                "completion_tokens": total_len - input_echo_len,
                "total_tokens": total_len,
            },
            "finish_reason": None,
        }

    # TODO: ChatGLM stop when it reach max length
    # Only last stream result contains finish_reason, we set finish_reason as stop
    ret = {
        "text": response,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": total_len - input_echo_len,
            "total_tokens": total_len,
        },
        "finish_reason": "stop",
    }
    yield ret
src/model/model_cllm.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gc
3
+
4
+ import os
5
+ import time
6
+ import random
7
+ from typing import Dict, Optional, Sequence, List, Tuple
8
+ from transformers.cache_utils import Cache, DynamicCache
9
+ from transformers import (
10
+ LlamaModel,
11
+ LlamaForCausalLM,
12
+ GenerationConfig,
13
+ StoppingCriteria,
14
+ StoppingCriteriaList,
15
+ TextIteratorStreamer,
16
+ )
17
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
18
+ import torch.nn.functional as F
19
+
20
+
21
def get_jacobian_trajectory(
    model, tokenizer, input_ids, attention_mask, max_new_tokens
):
    """Run Jacobi (fixed-point) decoding for one block of `max_new_tokens` tokens.

    The block after the prompt starts as tokens sampled at random from the
    prompt. The whole sequence is then re-predicted greedily in parallel on
    every iteration until it stops changing. The text of the first sequence
    that has become stable is printed to stdout incrementally.

    Args:
        model: Callable as `model(ids, mask)` returning an object with
            `.logits`; must expose `.device`.
        tokenizer: Provides `pad_token_id`, `eos_token_id` and `decode`.
        input_ids: (batch, seq) tensor of prompt token ids.
        attention_mask: (batch, seq) tensor; its per-row sum is taken as the
            prompt length.
        max_new_tokens: Number of tokens in the block being decoded.

    Returns:
        `(next_generation, itr)`: the (batch, max_prompt_len + max_new_tokens)
        token tensor at exit and the number of completed iterations.

    NOTE(review): the convergence bookkeeping (mismatch search, EOS check,
    printing) only looks at the first sequence, so batches larger than one
    are presumably not fully supported -- confirm with callers.
    """
    bsz = input_ids.shape[0]
    # Plain ints keep the slicing and size arithmetic below free of 0-d tensors.
    prompt_len = [int(torch.sum(mask_row)) for mask_row in attention_mask]
    max_prompt_len = max(prompt_len)
    total_len = max_prompt_len + max_new_tokens

    # initialize the first point of jacobian trajectory
    tokens = torch.full(
        (bsz, total_len), tokenizer.pad_token_id, dtype=torch.long, device=model.device
    )
    for i in range(bsz):
        # Random guess drawn from the prompt's own tokens ...
        tokens[i, :] = torch.tensor(
            random.choices(input_ids[i][attention_mask[i] == 1], k=total_len),
            dtype=torch.long,
            device=model.device,
        )
        # ... with the real prompt written over the prefix.
        tokens[i, : prompt_len[i]] = input_ids[i][: prompt_len[i]].to(
            dtype=torch.long, device=model.device
        )
    itr = 0
    next_generation = tokens
    generate_attention_mask = torch.full_like(next_generation, 1).to(model.device)
    # Length of the prefix known to be final, per sequence. Each entry starts
    # at that sequence's own prompt length (the original reused the leaked
    # loop index and gave every row the last prompt's length).
    accurate_lengths = list(prompt_len)
    prev_len = 0
    while True:
        current_generation = next_generation
        with torch.no_grad():
            logits = model(current_generation, generate_attention_mask).logits
        # Greedy pick for every position.
        next_generation = torch.argmax(
            torch.nn.functional.softmax(logits, dim=-1) / 0.001, dim=-1
        )

        # hold prompt unchanged and update generated tokens
        for i in range(bsz):
            # The prediction at position p is the token for position p + 1,
            # hence the shift by one.
            next_generation[i, :] = torch.cat(
                (
                    tokens[i, : prompt_len[i]],
                    next_generation[i, prompt_len[i] - 1 : total_len - 1],
                ),
                dim=0,
            )

        converged = torch.all(torch.eq(next_generation, current_generation)).item()
        eos_in_accepted = bool(
            (
                current_generation[0, : accurate_lengths[0]] == tokenizer.eos_token_id
            ).any()
        )
        if (converged and itr == max_new_tokens) or eos_in_accepted:
            # forced exit due to max_new_tokens constraint or eos reached
            return next_generation, itr

        # skip the first itr, current_generation has not been updated yet
        if itr != 0:
            if converged:
                matched_position = total_len
            else:
                # Index of the first position where the two iterates differ.
                matched_position = int(
                    torch.ne(current_generation, next_generation)
                    .squeeze(0)
                    .nonzero(as_tuple=True)[0][0]
                )

            for i in range(bsz):
                accurate_lengths[i] = matched_position

            # flush and print the first sequence
            generated_str = tokenizer.decode(
                next_generation[0, prompt_len[0] : accurate_lengths[0]],
                skip_special_tokens=True,
                spaces_between_special_tokens=False,
                clean_up_tokenization_spaces=True,
            )
            print(generated_str[prev_len:], flush=True, end="")
            prev_len = len(generated_str)

        if converged:
            # early termination: itr < max_new_tokens
            return next_generation, itr

        itr += 1
107
+
108
+
109
def generate_stream_cllm(
    model,
    tokenizer,
    params,
    device,
    context_len,
    stream_interval=2,
    judge_sent_end=False,
):
    """Generate with CLLM by chaining Jacobi-decoded blocks until EOS or budget.

    Each loop iteration calls `get_jacobian_trajectory` to decode one block of
    "n_token_seq_length" tokens. Generated text is not returned in the
    yielded dict: the single yielded chunk carries an empty "text", and the
    text itself is printed by `get_jacobian_trajectory` as it is produced.

    Args:
        model: Causal LM passed through to `get_jacobian_trajectory`.
        tokenizer: Tokenizer providing `eos_token_id`/`pad_token_id`.
        params: Request dict. Reads "prompt" (required),
            "n_token_seq_length" (block size, default 32) and
            "max_new_tokens" (total budget, default 1024).
        device: Device the inputs are moved to; also selects which cache to
            clear afterwards ("xpu"/"npu").
        context_len: Unused by this implementation.
        stream_interval: Unused by this implementation.
        judge_sent_end: Unused by this implementation.

    Yields:
        One dict with "text" (always ""), "usage" and "finish_reason"
        ("eos", "length" or "stop").
    """
    # converge_step = []
    prompt = params["prompt"]
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    max_new_tokens = int(params.get("n_token_seq_length", 32))
    max_new_seq_len = int(params.get("max_new_tokens", 1024))

    generation = inputs["input_ids"]
    # Token count of the prompt. The original took len() of the 2-D tensor,
    # which is the batch size, not the sequence length.
    input_echo_len = generation.shape[-1]

    ### generation phase
    itr = 0
    eos_reached = False
    while True:
        if itr == 0:
            input_ids = inputs["input_ids"]
            input_masks = inputs["attention_mask"]
        else:
            input_masks = torch.ones_like(input_ids).to(device)
            # Size the loop from the current batch; the original reused the
            # previous iteration's `bsz`, which is stale once finished
            # samples have been dropped.
            # NOTE(review): the prompt lengths below still index the original
            # batch, so rows may misalign after samples are dropped -- confirm
            # whether batches larger than one are expected here.
            for j in range(input_ids.shape[0]):
                input_masks[j][
                    torch.sum(inputs["attention_mask"], dim=-1)[j]
                    + itr * max_new_tokens :
                ] = 0

        bsz = input_ids.shape[0]
        eos_reached = torch.tensor([False] * bsz, device=device)

        generation, iter_steps = get_jacobian_trajectory(
            model=model,
            tokenizer=tokenizer,
            input_ids=input_ids,
            attention_mask=input_masks,
            max_new_tokens=max_new_tokens,
        )

        ### inspect <eos>
        prompt_len = torch.sum(input_masks, dim=-1)
        for j in range(bsz):
            eos_positions = torch.where(generation[j] == tokenizer.eos_token_id)[0]

            if len(eos_positions) == 0:
                # no EOS: pad anything past this block and move on
                generation[j][prompt_len[j] + max_new_tokens :] = tokenizer.pad_token_id
                continue
            # otherwise, set tokens coming after EOS as pad
            eos_reached[j] = True
            generation[j, int(eos_positions[0]) + 1 :] = tokenizer.pad_token_id

        itr += 1

        if all(eos_reached) or itr * max_new_tokens >= max_new_seq_len:
            break
        input_ids = generation[
            torch.where(~eos_reached)[0].tolist(), ...
        ]  # delete samples with <eos> generated

    if all(eos_reached):
        finish_reason = "eos"
    elif itr * max_new_tokens >= max_new_seq_len:
        # Same comparison as the loop's exit test, so exactly hitting the
        # budget is reported as "length" rather than falling through.
        finish_reason = "length"
    else:
        finish_reason = "stop"

    yield {
        "text": "",
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": itr * max_new_tokens,
            "total_tokens": input_echo_len + itr * max_new_tokens,
        },
        "finish_reason": finish_reason,
    }

    # clean
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
src/model/model_codet5p.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+ import torch
4
+ import transformers
5
+ from transformers import (
6
+ GenerationConfig,
7
+ StoppingCriteria,
8
+ StoppingCriteriaList,
9
+ TextIteratorStreamer,
10
+ )
11
+
12
+
13
@torch.inference_mode()
def generate_stream_codet5p(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream a CodeT5+ completion as a sequence of result dicts.

    Generation runs in a background thread that feeds a
    `TextIteratorStreamer`; this generator drains the streamer and yields the
    accumulated text every `stream_interval` pieces.

    Args:
        model: Model exposing `generate`.
        tokenizer: Matching tokenizer.
        params: Request dict. Reads "prompt" (required) and the optional
            "temperature", "repetition_penalty", "top_p", "top_k",
            "max_new_tokens" (default 1024) and "stop_token_ids".
        device: Device the encoded prompt is moved to; also selects which
            cache to clear afterwards ("xpu"/"npu").
        context_len: Unused by this implementation.
        stream_interval: Emit a chunk every this many streamed pieces.
        judge_sent_end: Unused by this implementation.

    Yields:
        Dicts with "text", "usage" and "finish_reason" (None for
        intermediate chunks, then "length" or "stop" on the final one).
    """
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", 50))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 1024))
    # Copy before appending so the caller's list inside `params` is not
    # mutated (it would otherwise gain another EOS id on every call).
    stop_token_ids = list(params.get("stop_token_ids", None) or [])
    stop_token_ids.append(tokenizer.eos_token_id)

    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    streamer = TextIteratorStreamer(tokenizer, **decode_config)
    encoding = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoding.input_ids
    encoding["decoder_input_ids"] = encoding["input_ids"].clone()
    # Number of prompt tokens. The original used len() of the 2-D tensor,
    # which is the batch size.
    input_echo_len = input_ids.shape[-1]

    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature >= 1e-5,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=10,
        top_p=top_p,
        top_k=top_k,
        eos_token_id=stop_token_ids,
    )

    class CodeBlockStopper(StoppingCriteria):
        """Stop once the last two generated token ids are [628, 198]."""

        def __call__(
            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
        ) -> bool:
            # Code-completion is open-end generation.
            # We check \n\n to stop at end of a code block.
            return input_ids[0][-2:].tolist() == [628, 198]

    gen_kwargs = dict(
        **encoding,
        streamer=streamer,
        generation_config=generation_config,
        stopping_criteria=StoppingCriteriaList([CodeBlockStopper()]),
    )
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    i = 0
    output = ""
    for new_text in streamer:
        i += 1
        output += new_text
        if i % stream_interval == 0 or i == max_new_tokens - 1:
            yield {
                "text": output,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": i,
                    "total_tokens": input_echo_len + i,
                },
                "finish_reason": None,
            }
        if i >= max_new_tokens:
            break

    finish_reason = "length" if i >= max_new_tokens else "stop"

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        },
        "finish_reason": finish_reason,
    }
    thread.join()

    # clean
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
src/model/model_exllama.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import sys
3
+ from typing import Dict
4
+
5
+ import torch
6
+
7
+
8
def generate_stream_exllama(
    model,
    tokenizer,
    params: Dict,
    device: str,
    context_len: int,
    stream_interval: int = 2,
    judge_sent_end: bool = False,
):
    """Stream an ExLlamaV2 completion as a sequence of result dicts.

    Args:
        model: Wrapper exposing `.model` and `.cache` for ExLlamaV2.
        tokenizer: ExLlamaV2 tokenizer.
        params: Request dict. Reads "prompt" (required) and the optional
            "temperature" (0.85), "top_k" (50), "top_p" (0.8),
            "repetition_penalty" (1.15), "max_new_tokens" (256),
            "stop_token_ids" and "echo" (True).
        device: Unused by this implementation.
        context_len: Unused by this implementation.
        stream_interval: Unused by this implementation.
        judge_sent_end: Unused by this implementation.

    Yields:
        Dicts with "text", "usage" and "finish_reason". Intermediate chunks
        have `finish_reason=None`; the final chunk reports "length" when the
        token budget is exhausted and "stop" when the generator signals end
        of stream.

    Exits the process with status -1 if exllamav2 cannot be imported.
    """
    try:
        from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
    except ImportError as e:
        print(f"Error: Failed to load Exllamav2. {e}")
        sys.exit(-1)

    prompt = params["prompt"]

    generator = ExLlamaV2StreamingGenerator(model.model, model.cache, tokenizer)
    settings = ExLlamaV2Sampler.Settings()

    settings.temperature = float(params.get("temperature", 0.85))
    settings.top_k = int(params.get("top_k", 50))
    settings.top_p = float(params.get("top_p", 0.8))
    settings.token_repetition_penalty = float(params.get("repetition_penalty", 1.15))
    settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])

    max_new_tokens = int(params.get("max_new_tokens", 256))

    generator.set_stop_conditions(params.get("stop_token_ids", None) or [])
    echo = bool(params.get("echo", True))

    input_ids = generator.tokenizer.encode(prompt)
    prompt_tokens = input_ids.shape[-1]
    generator.begin_stream(input_ids, settings)

    generated_tokens = 0
    output = prompt if echo else ""
    while True:
        chunk, eos, _ = generator.stream()
        output += chunk
        generated_tokens += 1
        if generated_tokens == max_new_tokens:
            finish_reason = "length"
            break
        elif eos:
            # The generator hit a stop condition, not the token budget
            # (this branch used to report "length" as well).
            finish_reason = "stop"
            break
        yield {
            "text": output,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": generated_tokens,
                "total_tokens": prompt_tokens + generated_tokens,
            },
            "finish_reason": None,
        }

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": generated_tokens,
            "total_tokens": prompt_tokens + generated_tokens,
        },
        "finish_reason": finish_reason,
    }
    gc.collect()
src/model/model_falcon.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+ from typing import Iterable
4
+
5
+ import torch
6
+ import transformers
7
+ from transformers import TextIteratorStreamer, GenerationConfig
8
+
9
+ from fastchat.utils import is_partial_stop
10
+
11
+
12
@torch.inference_mode()
def generate_stream_falcon(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream text generated by ``model.generate`` for a single prompt.

    Generation runs on a background thread and its decoded chunks are read
    from a ``TextIteratorStreamer``. Intermediate results are yielded with
    ``finish_reason=None``; one final result carrying the finish reason is
    always yielded last.

    Args:
        model: Object exposing ``generate`` and ``device``.
        tokenizer: Callable tokenizer exposing ``eos_token_id``.
        params: Request dict. ``"prompt"`` is required. Optional keys:
            ``temperature``, ``repetition_penalty``, ``top_p``, ``top_k``,
            ``max_new_tokens``, ``stop`` (a string or an iterable of
            strings), ``echo`` and ``stop_token_ids``.
        device: Device name; only consulted to decide whether the XPU / NPU
            caches are emptied at the end.
        context_len: Token budget shared by the prompt and the completion.
        stream_interval: Stop strings are checked (and a partial result is
            yielded) once every ``stream_interval`` streamed chunks.
        judge_sent_end: Not read by this function.

    Yields:
        dict: ``{"text", "usage", "finish_reason"}``. ``completion_tokens``
        is the index of the latest streamed chunk, not an exact token count.

    Raises:
        KeyError: If ``params`` has no ``"prompt"`` entry.
        ValueError: If ``params["stop"]`` is neither a string nor iterable.
    """
    prompt = params["prompt"]
    len_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", 50))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 256))
    stop_str = params.get("stop", None)
    echo = bool(params.get("echo", True))
    # Copy before appending so the caller's list does not gain one more
    # EOS id every time this function is invoked with the same params.
    stop_token_ids = list(params.get("stop_token_ids", None) or [])
    stop_token_ids.append(tokenizer.eos_token_id)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    max_src_len = context_len - max_new_tokens - 8

    # The tokenizer returns batched tensors, so the sequence is the LAST
    # axis; slicing the first axis would leave the prompt untouched.
    # A non-positive budget cannot be expressed as a "keep the last N"
    # slice, so the prompt is left as is in that case.
    if max_src_len > 0:
        input_ids = input_ids[..., -max_src_len:]  # truncate from the left
        attention_mask = attention_mask[..., -max_src_len:]
    input_echo_len = input_ids.shape[-1]

    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)

    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature >= 1e-5,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=10,
        top_p=top_p,
        top_k=top_k,
        eos_token_id=stop_token_ids,
    )

    generation_kwargs = dict(
        inputs=input_ids,
        attention_mask=attention_mask,
        streamer=streamer,
        generation_config=generation_config,
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    if echo:
        # means keep the prompt
        output = prompt
    else:
        output = ""

    # Stop strings must only be searched in generated text, never in the
    # echoed prompt.
    rfind_start = len_prompt if echo else 0

    # Defined up front so the final event can still be built when the
    # streamer produces no chunk at all.
    i = 0
    partially_stopped = False

    for i, new_text in enumerate(streamer):
        output += new_text
        if i % stream_interval == 0:
            partially_stopped = False
            if stop_str:
                if isinstance(stop_str, str):
                    pos = output.rfind(stop_str, rfind_start)
                    if pos != -1:
                        output = output[:pos]
                    else:
                        partially_stopped = is_partial_stop(output, stop_str)
                elif isinstance(stop_str, Iterable):
                    for each_stop in stop_str:
                        pos = output.rfind(each_stop, rfind_start)
                        if pos != -1:
                            output = output[:pos]
                            break
                        else:
                            partially_stopped = is_partial_stop(output, each_stop)
                            if partially_stopped:
                                break
                else:
                    raise ValueError("Invalid stop field type.")

            # prevent yielding partial stop sequence
            if not partially_stopped:
                yield {
                    "text": output,
                    "usage": {
                        "prompt_tokens": input_echo_len,
                        "completion_tokens": i,
                        "total_tokens": input_echo_len + i,
                    },
                    "finish_reason": None,
                }
    output = output.strip()

    # finish stream event, which contains finish reason
    if i == max_new_tokens - 1:
        finish_reason = "length"
    elif partially_stopped:
        finish_reason = None
    else:
        finish_reason = "stop"

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        },
        "finish_reason": finish_reason,
    }

    # clean
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
src/model/model_registry.py ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Additional information of the models."""
2
+ from collections import namedtuple, OrderedDict
3
+ from typing import List
4
+
5
+
6
# Display metadata shared by every full name of one model family.
ModelInfo = namedtuple("ModelInfo", ["simple_name", "link", "description"])


# Maps a model's full name to its ModelInfo, in registration order.
model_info = OrderedDict()


def register_model_info(
    full_names: List[str], simple_name: str, link: str, description: str
):
    """Register one shared ``ModelInfo`` under each name in ``full_names``.

    A later registration of the same full name replaces the earlier one.

    Args:
        full_names: Full model names to register. A single bare string is
            accepted and treated as one name.
        simple_name: Short display name of the model family.
        link: URL with more information about the model.
        description: One-line description of the model.
    """
    if isinstance(full_names, str):
        # A bare string would otherwise be iterated character by character,
        # registering one bogus entry per letter.
        full_names = [full_names]

    info = ModelInfo(simple_name, link, description)

    for full_name in full_names:
        model_info[full_name] = info


def get_model_info(name: str) -> ModelInfo:
    """Return the registered ``ModelInfo`` for ``name``.

    Unregistered names get a placeholder whose ``simple_name`` is ``name``
    itself, whose ``link`` is empty, and whose ``description`` points at
    where to register the model.
    """
    if name in model_info:
        return model_info[name]
    else:
        # To fix this, please use `register_model_info` to register your model
        return ModelInfo(
            name, "", "Register the description at fastchat/model/model_registry.py"
        )
+
30
+
31
+ register_model_info(
32
+ [
33
+ "IEITYuan/Yuan2-2B-Janus-hf",
34
+ "IEITYuan/Yuan2-2B-hf",
35
+ "IEITYuan/Yuan2-51B-hf",
36
+ "IEITYuan/Yuan2-102B-hf",
37
+ ],
38
+ "IEIT-Yuan2",
39
+ "https://github.com/IEIT-Yuan/Yuan-2.0",
40
+ "Yuan2.0 is a new generation Fundamental Large Language Model developed by IEIT System.",
41
+ )
42
+
43
+ register_model_info(
44
+ [
45
+ "claude-3-haiku-20240307",
46
+ "claude-3-sonnet-20240229",
47
+ "claude-3-opus-20240229",
48
+ "claude-2.1",
49
+ "claude-2.0",
50
+ "claude-1",
51
+ ],
52
+ "Claude",
53
+ "https://www.anthropic.com/news/claude-3-family",
54
+ "Claude by Anthropic",
55
+ )
56
+
57
+ register_model_info(
58
+ ["reka-flash", "reka-flash-online"],
59
+ "Reka Flash",
60
+ "https://www.reka.ai/news/reka-flash-efficient-and-capable-multimodal-language-models",
61
+ "Multimodal model by Reka",
62
+ )
63
+
64
+ register_model_info(
65
+ ["command-r-plus"],
66
+ "Command-R-Plus",
67
+ "https://txt.cohere.com/command-r-plus-microsoft-azure/",
68
+ "Command-R Plus by Cohere",
69
+ )
70
+
71
+ register_model_info(
72
+ ["command-r"],
73
+ "Command-R",
74
+ "https://txt.cohere.com/command-r/",
75
+ "Command-R by Cohere",
76
+ )
77
+
78
+ register_model_info(
79
+ [
80
+ "zephyr-orpo-141b-A35b-v0.1",
81
+ ],
82
+ "Zephyr 141B-A35B",
83
+ "https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
84
+ "ORPO fine-tuned of Mixtral-8x22B-v0.1",
85
+ )
86
+
87
+ register_model_info(
88
+ ["gemma-1.1-7b-it", "gemma-1.1-2b-it", "gemma-7b-it", "gemma-2b-it"],
89
+ "Gemma",
90
+ "https://blog.google/technology/developers/gemma-open-models/",
91
+ "Gemma by Google",
92
+ )
93
+
94
+ register_model_info(
95
+ [
96
+ "mixtral-8x7b-instruct-v0.1",
97
+ "mistral-large-2402",
98
+ "mistral-medium",
99
+ "mistral-next",
100
+ "mistral-7b-instruct-v0.2",
101
+ "mistral-7b-instruct",
102
+ ],
103
+ "Mixtral of experts",
104
+ "https://mistral.ai/news/mixtral-of-experts/",
105
+ "A Mixture-of-Experts model by Mistral AI",
106
+ )
107
+
108
+ register_model_info(
109
+ [
110
+ "qwen1.5-72b-chat",
111
+ "qwen1.5-32b-chat",
112
+ "qwen1.5-14b-chat",
113
+ "qwen1.5-7b-chat",
114
+ "qwen1.5-4b-chat",
115
+ "qwen1.5-1.8b-chat",
116
+ "qwen1.5-0.5b-chat",
117
+ "qwen-14b-chat",
118
+ ],
119
+ "Qwen 1.5",
120
+ "https://qwenlm.github.io/blog/qwen1.5/",
121
+ "A large language model by Alibaba Cloud",
122
+ )
123
+
124
+
125
+ register_model_info(
126
+ ["dbrx-instruct"],
127
+ "DBRX Instruct",
128
+ "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm",
129
+ "DBRX by Databricks Mosaic AI",
130
+ )
131
+
132
+ register_model_info(
133
+ ["starling-lm-7b-beta", "starling-lm-7b-alpha"],
134
+ "Starling-LM-7B",
135
+ "https://starling.cs.berkeley.edu/",
136
+ "An open model trained using RLAIF by Berkeley",
137
+ )
138
+
139
+ register_model_info(
140
+ ["qwen-14b-chat"],
141
+ "Qwen",
142
+ "https://huggingface.co/Qwen",
143
+ "A large language model by Alibaba Cloud",
144
+ )
145
+
146
+ register_model_info(
147
+ ["bard-feb-2024", "bard-jan-24-gemini-pro"],
148
+ "Bard",
149
+ "https://bard.google.com/",
150
+ "Bard by Google",
151
+ )
152
+
153
+ register_model_info(
154
+ [
155
+ "gemini-pro",
156
+ "gemini-pro-dev-api",
157
+ "gemini-1.0-pro-vision",
158
+ "gemini-1.5-pro-preview-0409",
159
+ ],
160
+ "Gemini",
161
+ "https://blog.google/technology/ai/google-gemini-pro-imagen-duet-ai-update/",
162
+ "Gemini by Google",
163
+ )
164
+
165
+ register_model_info(
166
+ ["stripedhyena-nous-7b"],
167
+ "StripedHyena-Nous",
168
+ "https://huggingface.co/togethercomputer/StripedHyena-Nous-7B",
169
+ "A chat model developed by Together Research and Nous Research.",
170
+ )
171
+
172
+ register_model_info(
173
+ ["solar-10.7b-instruct-v1.0"],
174
+ "SOLAR-10.7B-Instruct",
175
+ "https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0",
176
+ "A model trained using depth up-scaling by Upstage AI",
177
+ )
178
+
179
+ register_model_info(
180
+ [
181
+ "gpt-4-turbo",
182
+ "gpt-4-turbo-2024-04-09",
183
+ "gpt-4-1106-preview",
184
+ "gpt-4-0125-preview",
185
+ ],
186
+ "GPT-4-Turbo",
187
+ "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
188
+ "GPT-4-Turbo by OpenAI",
189
+ )
190
+
191
+ register_model_info(
192
+ ["gpt-4-turbo-browsing"],
193
+ "GPT-4-Turbo with browsing",
194
+ "https://platform.openai.com/docs/assistants/overview",
195
+ "GPT-4-Turbo with browsing by OpenAI",
196
+ )
197
+
198
+ register_model_info(
199
+ [
200
+ "gpt-3.5-turbo",
201
+ "gpt-3.5-turbo-0125",
202
+ "gpt-3.5-turbo-1106",
203
+ "gpt-3.5-turbo-0314",
204
+ "gpt-3.5-turbo-0613",
205
+ ],
206
+ "GPT-3.5",
207
+ "https://platform.openai.com/docs/models/gpt-3-5",
208
+ "GPT-3.5-Turbo by OpenAI",
209
+ )
210
+
211
+ register_model_info(
212
+ ["gpt-4", "gpt-4-0314", "gpt-4-0613"],
213
+ "GPT-4",
214
+ "https://openai.com/research/gpt-4",
215
+ "GPT-4 by OpenAI",
216
+ )
217
+
218
+ register_model_info(
219
+ ["claude-instant-1", "claude-instant-1.2"],
220
+ "Claude Instant",
221
+ "https://www.anthropic.com/index/introducing-claude",
222
+ "Claude Instant by Anthropic",
223
+ )
224
+
225
+ register_model_info(
226
+ ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"],
227
+ "Llama 2",
228
+ "https://ai.meta.com/llama/",
229
+ "Open foundation and fine-tuned chat models by Meta",
230
+ )
231
+
232
+ register_model_info(
233
+ ["olmo-7b-instruct"],
234
+ "OLMo-7B",
235
+ "https://huggingface.co/allenai/OLMo-7B-Instruct",
236
+ "OLMo by Allen AI",
237
+ )
238
+
239
+ register_model_info(
240
+ [
241
+ "vicuna-33b",
242
+ "vicuna-33b-v1.3",
243
+ "vicuna-13b",
244
+ "vicuna-13b-v1.5",
245
+ "vicuna-7b",
246
+ "vicuna-7b-v1.5",
247
+ ],
248
+ "Vicuna",
249
+ "https://lmsys.org/blog/2023-03-30-vicuna/",
250
+ "A chat assistant fine-tuned on user-shared conversations by LMSYS",
251
+ )
252
+
253
+ register_model_info(
254
+ ["yi-34b-chat", "yi-6b-chat"],
255
+ "Yi-Chat",
256
+ "https://huggingface.co/01-ai/Yi-34B-Chat",
257
+ "A large language model by 01 AI",
258
+ )
259
+
260
+ register_model_info(
261
+ [
262
+ "codellama-70b-instruct",
263
+ "codellama-34b-instruct",
264
+ "codellama-13b-instruct",
265
+ "codellama-7b-instruct",
266
+ ],
267
+ "Code Llama",
268
+ "https://ai.meta.com/blog/code-llama-large-language-model-coding/",
269
+ "Open foundation models for code by Meta",
270
+ )
271
+
272
+ register_model_info(
273
+ ["openchat-3.5-0106", "openchat-3.5"],
274
+ "OpenChat 3.5",
275
+ "https://github.com/imoneoi/openchat",
276
+ "An open model fine-tuned on Mistral-7B using C-RLFT",
277
+ )
278
+
279
+ register_model_info(
280
+ ["deepseek-llm-67b-chat"],
281
+ "DeepSeek LLM",
282
+ "https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat",
283
+ "An advanced language model by DeepSeek",
284
+ )
285
+
286
+ register_model_info(
287
+ ["stripedhyena-nous-7b"],
288
+ "StripedHyena-Nous",
289
+ "https://huggingface.co/togethercomputer/StripedHyena-Nous-7B",
290
+ "A chat model developed by Together Research and Nous Research.",
291
+ )
292
+
293
+ register_model_info(
294
+ ["nous-hermes-2-mixtral-8x7b-dpo"],
295
+ "Nous-Hermes-2-Mixtral-8x7B-DPO",
296
+ "https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
297
+ "Nous Hermes finetuned from Mixtral 8x7B",
298
+ )
299
+
300
+
301
+ register_model_info(
302
+ ["llama2-70b-steerlm-chat"],
303
+ "Llama2-70B-SteerLM-Chat",
304
+ "https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat",
305
+ "A Llama fine-tuned with SteerLM method by NVIDIA",
306
+ )
307
+
308
+ register_model_info(
309
+ ["pplx-70b-online", "pplx-7b-online"],
310
+ "pplx-online-llms",
311
+ "https://blog.perplexity.ai/blog/introducing-pplx-online-llms",
312
+ "Online LLM API by Perplexity AI",
313
+ )
314
+
315
+ register_model_info(
316
+ ["openhermes-2.5-mistral-7b"],
317
+ "OpenHermes-2.5-Mistral-7B",
318
+ "https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B",
319
+ "A mistral-based model fine-tuned on 1M GPT-4 outputs",
320
+ )
321
+
322
+ register_model_info(
323
+ ["tulu-2-dpo-70b"],
324
+ "Tulu 2",
325
+ "https://huggingface.co/allenai/tulu-2-dpo-70b",
326
+ "An instruction and RLHF model by UW/AllenAI",
327
+ )
328
+
329
+ register_model_info(
330
+ ["chatglm3-6b", "chatglm2-6b", "chatglm-6b"],
331
+ "ChatGLM",
332
+ "https://chatglm.cn/blog",
333
+ "An open bilingual dialogue language model by Tsinghua University",
334
+ )
335
+
336
+ register_model_info(
337
+ ["tenyxchat-7b-v1"],
338
+ "TenyxChat-7B",
339
+ "https://huggingface.co/tenyx/TenyxChat-7B-v1",
340
+ "An open model DPO trained on top of OpenChat-3.5 using Tenyx fine-tuning",
341
+ )
342
+
343
+ register_model_info(
344
+ ["zephyr-7b-beta", "zephyr-7b-alpha"],
345
+ "Zephyr",
346
+ "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha",
347
+ "A chatbot fine-tuned from Mistral by Hugging Face",
348
+ )
349
+
350
+ register_model_info(
351
+ ["notus-7b-v1"],
352
+ "Notus",
353
+ "https://huggingface.co/argilla/notus-7b-v1",
354
+ "A chatbot fine-tuned from Zephyr SFT by Argilla",
355
+ )
356
+
357
+ register_model_info(
358
+ ["catppt"],
359
+ "CatPPT",
360
+ "https://huggingface.co/rishiraj/CatPPT",
361
+ "A chatbot fine-tuned from a SLERP merged model by Rishiraj Acharya",
362
+ )
363
+
364
+ register_model_info(
365
+ ["TinyLlama"],
366
+ "TinyLlama",
367
+ "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
368
+ "The TinyLlama project is an open endeavor to pretrain a 1.1B Llama model on 3 trillion tokens.",
369
+ )
370
+
371
+ register_model_info(
372
+ ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"],
373
+ "WizardLM",
374
+ "https://github.com/nlpxucan/WizardLM",
375
+ "An instruction-following LLM using evol-instruct by Microsoft",
376
+ )
377
+
378
+ register_model_info(
379
+ ["wizardcoder-15b-v1.0"],
380
+ "WizardLM",
381
+ "https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder",
382
+ "Empowering Code Large Language Models with Evol-Instruct",
383
+ )
384
+
385
+ register_model_info(
386
+ ["mpt-7b-chat", "mpt-30b-chat"],
387
+ "MPT-Chat",
388
+ "https://www.mosaicml.com/blog/mpt-30b",
389
+ "A chatbot fine-tuned from MPT by MosaicML",
390
+ )
391
+
392
+ register_model_info(
393
+ ["guanaco-33b", "guanaco-65b"],
394
+ "Guanaco",
395
+ "https://github.com/artidoro/qlora",
396
+ "A model fine-tuned with QLoRA by UW",
397
+ )
398
+
399
+ register_model_info(
400
+ ["gpt4all-13b-snoozy"],
401
+ "GPT4All-Snoozy",
402
+ "https://github.com/nomic-ai/gpt4all",
403
+ "A finetuned LLaMA model on assistant style data by Nomic AI",
404
+ )
405
+
406
+ register_model_info(
407
+ ["koala-13b"],
408
+ "Koala",
409
+ "https://bair.berkeley.edu/blog/2023/04/03/koala",
410
+ "A dialogue model for academic research by BAIR",
411
+ )
412
+
413
+ register_model_info(
414
+ ["RWKV-4-Raven-14B"],
415
+ "RWKV-4-Raven",
416
+ "https://huggingface.co/BlinkDL/rwkv-4-raven",
417
+ "An RNN with transformer-level LLM performance",
418
+ )
419
+
420
+ register_model_info(
421
+ ["alpaca-13b"],
422
+ "Alpaca",
423
+ "https://crfm.stanford.edu/2023/03/13/alpaca.html",
424
+ "A model fine-tuned from LLaMA on instruction-following demonstrations by Stanford",
425
+ )
426
+
427
+ register_model_info(
428
+ ["oasst-pythia-12b"],
429
+ "OpenAssistant (oasst)",
430
+ "https://open-assistant.io",
431
+ "An Open Assistant for everyone by LAION",
432
+ )
433
+
434
+ register_model_info(
435
+ ["oasst-sft-7-llama-30b"],
436
+ "OpenAssistant (oasst)",
437
+ "https://open-assistant.io",
438
+ "An Open Assistant for everyone by LAION",
439
+ )
440
+
441
+ register_model_info(
442
+ ["palm-2"],
443
+ "PaLM 2 Chat",
444
+ "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023",
445
+ "PaLM 2 for Chat (chat-bison@001) by Google",
446
+ )
447
+
448
+ register_model_info(
449
+ ["llama-7b", "llama-13b"],
450
+ "LLaMA",
451
+ "https://arxiv.org/abs/2302.13971",
452
+ "Open and efficient foundation language models by Meta",
453
+ )
454
+
455
+ register_model_info(
456
+ ["open-llama-7b-v2-open-instruct", "open-llama-7b-open-instruct"],
457
+ "Open LLaMa (Open Instruct)",
458
+ "https://medium.com/vmware-data-ml-blog/starter-llm-for-the-enterprise-instruction-tuning-openllama-7b-d05fc3bbaccc",
459
+ "Open LLaMa fine-tuned on instruction-following data by VMware",
460
+ )
461
+
462
+ register_model_info(
463
+ ["dolly-v2-12b"],
464
+ "Dolly",
465
+ "https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm",
466
+ "An instruction-tuned open large language model by Databricks",
467
+ )
468
+
469
+ register_model_info(
470
+ ["stablelm-tuned-alpha-7b"],
471
+ "StableLM",
472
+ "https://github.com/stability-AI/stableLM",
473
+ "Stability AI language models",
474
+ )
475
+
476
+ register_model_info(
477
+ ["codet5p-6b"],
478
+ "CodeT5p-6b",
479
+ "https://huggingface.co/Salesforce/codet5p-6b",
480
+ "Code completion model released by Salesforce",
481
+ )
482
+
483
+ register_model_info(
484
+ ["fastchat-t5-3b", "fastchat-t5-3b-v1.0"],
485
+ "FastChat-T5",
486
+ "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0",
487
+ "A chat assistant fine-tuned from FLAN-T5 by LMSYS",
488
+ )
489
+
490
+ register_model_info(
491
+ ["phoenix-inst-chat-7b"],
492
+ "Phoenix-7B",
493
+ "https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b",
494
+ "A multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)",
495
+ )
496
+
497
+ register_model_info(
498
+ ["realm-7b-v1"],
499
+ "ReaLM",
500
+ "https://github.com/FreedomIntelligence/ReaLM",
501
+ "A chatbot fine-tuned from LLaMA2 with data generated via iterative calls to UserGPT and ChatGPT by CUHK(SZ) and SRIBD.",
502
+ )
503
+
504
+ register_model_info(
505
+ ["billa-7b-sft"],
506
+ "BiLLa-7B-SFT",
507
+ "https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
508
+ "An instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
509
+ )
510
+
511
+ register_model_info(
512
+ ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
513
+ "h2oGPT-GM-7b",
514
+ "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
515
+ "An instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
516
+ )
517
+
518
register_model_info(
    ["baize-v2-7b", "baize-v2-13b"],
    "Baize v2",
    "https://github.com/project-baize/baize-chatbot#v2",
    "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Distillation with Feedback (SDF) by UCSD and SYSU.",
)
524
+
525
+ register_model_info(
526
+ [
527
+ "airoboros-l2-7b-2.1",
528
+ "airoboros-l2-13b-2.1",
529
+ "airoboros-c34b-2.1",
530
+ "airoboros-l2-70b-2.1",
531
+ ],
532
+ "airoboros",
533
+ "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1",
534
+ "An instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4",
535
+ )
536
+
537
+ register_model_info(
538
+ [
539
+ "spicyboros-7b-2.2",
540
+ "spicyboros-13b-2.2",
541
+ "spicyboros-70b-2.2",
542
+ ],
543
+ "spicyboros",
544
+ "https://huggingface.co/jondurbin/spicyboros-70b-2.2",
545
+ "De-aligned versions of the airoboros models",
546
+ )
547
+
548
+ register_model_info(
549
+ ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"],
550
+ "Robin-v2",
551
+ "https://huggingface.co/OptimalScale/robin-7b-v2-delta",
552
+ "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.",
553
+ )
554
+
555
+ register_model_info(
556
+ ["manticore-13b-chat"],
557
+ "Manticore 13B Chat",
558
+ "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg",
559
+ "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.",
560
+ )
561
+
562
+ register_model_info(
563
+ ["redpajama-incite-7b-chat"],
564
+ "RedPajama-INCITE-7B-Chat",
565
+ "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat",
566
+ "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together",
567
+ )
568
+
569
+ register_model_info(
570
+ [
571
+ "falcon-7b",
572
+ "falcon-7b-instruct",
573
+ "falcon-40b",
574
+ "falcon-40b-instruct",
575
+ "falcon-180b",
576
+ "falcon-180b-chat",
577
+ ],
578
+ "Falcon",
579
+ "https://huggingface.co/tiiuae/falcon-180B",
580
+ "TII's flagship series of large language models",
581
+ )
582
+
583
+ register_model_info(
584
+ ["tigerbot-7b-sft"],
585
+ "Tigerbot",
586
+ "https://huggingface.co/TigerResearch/tigerbot-7b-sft",
587
+ "A large-scale language model (LLM) with multiple languages and tasks.",
588
+ )
589
+
590
+ register_model_info(
591
+ ["internlm-chat-7b", "internlm-chat-7b-8k"],
592
+ "InternLM",
593
+ "https://huggingface.co/internlm/internlm-chat-7b",
594
+ "A multi-language large-scale language model (LLM), developed by SHLAB.",
595
+ )
596
+
597
+ register_model_info(
598
+ ["Qwen-7B-Chat"],
599
+ "Qwen",
600
+ "https://huggingface.co/Qwen/Qwen-7B-Chat",
601
+ "A multi-language large-scale language model (LLM), developed by Damo Academy.",
602
+ )
603
+
604
+ register_model_info(
605
+ ["smaug-2-72b"],
606
+ "Smaug-2-72B",
607
+ "https://huggingface.co/abacusai/Smaug-2-72B",
608
+ "An open model trained by Abacus.AI.",
609
+ )
610
+
611
+ register_model_info(
612
+ ["Llama2-Chinese-13b-Chat", "LLama2-Chinese-13B"],
613
+ "Llama2-Chinese",
614
+ "https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat",
615
+ "A multi-language large-scale language model (LLM), developed by FlagAlpha.",
616
+ )
617
+
618
+ register_model_info(
619
+ ["Meta-Llama-3-8B-Instruct", "Meta-Llama-3-70B-Instruct"],
620
+ "llama-3",
621
+ "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct",
622
+ "Meta developed and released the Meta Llama 3 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B sizes.",
623
+ )
624
+
625
+ register_model_info(
626
+ ["Chinese-Alpaca-2-7B", "Chinese-Alpaca-2-13B"],
627
+ "Chinese-Alpaca",
628
+ "https://huggingface.co/hfl/chinese-alpaca-2-13b",
629
+ "New extended Chinese vocabulary beyond Llama-2, open-sourcing the Chinese LLaMA-2 and Alpaca-2 LLMs.",
630
+ )
631
+
632
+ register_model_info(
633
+ ["Vigogne-2-7B-Instruct", "Vigogne-2-13B-Instruct"],
634
+ "Vigogne-Instruct",
635
+ "https://huggingface.co/bofenghuang/vigogne-2-7b-instruct",
636
+ "A French large language model (LLM) optimized for instruction-following, developed by Bofeng Huang",
637
+ )
638
+
639
+ register_model_info(
640
+ ["Vigogne-2-7B-Chat", "Vigogne-2-13B-Chat"],
641
+ "Vigogne-Chat",
642
+ "https://huggingface.co/bofenghuang/vigogne-2-7b-chat",
643
+ "A French large language model (LLM) optimized for instruction-following and multi-turn dialogues, developed by Bofeng Huang",
644
+ )
645
+
646
+ register_model_info(
647
+ ["stable-vicuna-13B-HF"],
648
+ "stable-vicuna",
649
+ "https://huggingface.co/TheBloke/stable-vicuna-13B-HF",
650
+ "A Vicuna model fine-tuned using RLHF via PPO on various conversational and instructional datasets.",
651
+ )
652
+
653
+ register_model_info(
654
+ ["deluxe-chat-v1", "deluxe-chat-v1.1", "deluxe-chat-v1.2", "deluxe-chat-v1.3"],
655
+ "DeluxeChat",
656
+ "",
657
+ "Deluxe Chat",
658
+ )
659
+
660
+ register_model_info(
661
+ [
662
+ "Xwin-LM-7B-V0.1",
663
+ "Xwin-LM-13B-V0.1",
664
+ "Xwin-LM-70B-V0.1",
665
+ "Xwin-LM-7B-V0.2",
666
+ "Xwin-LM-13B-V0.2",
667
+ ],
668
+ "Xwin-LM",
669
+ "https://github.com/Xwin-LM/Xwin-LM",
670
+ "Chat models developed by Xwin-LM team",
671
+ )
672
+
673
+ register_model_info(
674
+ ["lemur-70b-chat"],
675
+ "Lemur-Chat",
676
+ "https://huggingface.co/OpenLemur/lemur-70b-chat-v1",
677
+ "An openly accessible language model optimized for both natural language and coding capabilities ",
678
+ )
679
+
680
+ register_model_info(
681
+ ["Mistral-7B-OpenOrca"],
682
+ "Open-Orca",
683
+ "https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca",
684
+ "A fine-tune of [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) using [OpenOrca dataset](https://huggingface.co/datasets/Open-Orca/OpenOrca)",
685
+ )
686
+
687
+ register_model_info(
688
+ ["dolphin-2.2.1-mistral-7b"],
689
+ "dolphin-mistral",
690
+ "https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b",
691
+ "An uncensored fine-tuned Mistral 7B",
692
+ )
693
+
694
+ register_model_info(
695
+ [
696
+ "AquilaChat-7B",
697
+ "AquilaChat2-7B",
698
+ "AquilaChat2-34B",
699
+ ],
700
+ "Aquila-Chat",
701
+ "https://huggingface.co/BAAI/AquilaChat2-34B",
702
+ "Chat models developed by BAAI team",
703
+ )
704
+
705
+ register_model_info(
706
+ ["xDAN-L1-Chat-RL-v1"],
707
+ "xDAN-L1-Chat",
708
+ "https://huggingface.co/xDAN-AI/xDAN-L1-Chat-RL-v1",
709
+ "A large language chat model created by xDAN-AI.",
710
+ )
711
+
712
+ register_model_info(
713
+ ["MetaMath-70B-V1.0", "MetaMath-7B-V1.0"],
714
+ "MetaMath",
715
+ "https://huggingface.co/meta-math",
716
+ "A finetune of Llama2 on [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) that specializes in mathematical reasoning.",
717
+ )
718
+
719
+ register_model_info(
720
+ ["Yuan2-2B-hf", "Yuan2-51B-hf", "Yuan2-102B-hf"],
721
+ "IEIYuan",
722
+ "https://huggingface.co/IEITYuan",
723
+ "A Basemodel developed by IEI.",
724
+ )
725
+
726
+ register_model_info(
727
+ [
728
+ "llava-v1.6-34b",
729
+ "llava-v1.6-vicuna-13b",
730
+ "llava-v1.6-vicuna-7b",
731
+ "llava-v1.6-mistral-7b",
732
+ "llava-v1.5-13b",
733
+ "llava-v1.5-7b",
734
+ ],
735
+ "LLaVA",
736
+ "https://github.com/haotian-liu/LLaVA",
737
+ "an open large language and vision assistant",
738
+ )
739
+
740
+ register_model_info(
741
+ ["gemma-7b-it", "gemma-2b-it"],
742
+ "Gemma",
743
+ "https://blog.google/technology/developers/gemma-open-models/",
744
+ "Gemma by Google",
745
+ )
746
+
747
+ register_model_info(
748
+ [
749
+ "cllm/consistency-llm-7b-codesearchnet",
750
+ "cllm/consistency-llm-7b-gsm8k",
751
+ "cllm/consistency-llm-7b-sharegpt48k",
752
+ "cllm/consistency-llm-7b-spider",
753
+ ],
754
+ "consistency-llm",
755
+ "https://huggingface.co/cllm",
756
+ "consistency-llm is a new generation of parallel decoder LLMs with fast generation speed.",
757
+ )
758
+
759
+ register_model_info(
760
+ ["reka-flash", "reka-flash-20240226"],
761
+ "Reka Flash",
762
+ "https://reka.ai/reka-flash",
763
+ "Multimodal model by Reka",
764
+ )
src/model/model_xfastertransformer.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+
4
+ import torch
5
+ from transformers import TextIteratorStreamer
6
+
7
+
8
@torch.inference_mode()
def generate_stream_xft(
    model,
    tokenizer,
    params,
    device,
    context_len=8192,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream text produced by ``model.model.generate`` for one prompt.

    Generation is started on a background thread; decoded chunks are read
    from a ``TextIteratorStreamer`` and the accumulated text is yielded
    after every chunk with ``finish_reason=None``. A final event with the
    stripped text and the finish reason is yielded last.

    Args:
        model: Wrapper exposing ``model.generate`` and a ``config`` with
            ``padding``, ``beam_width``, ``num_return_sequences``,
            ``early_stopping``, ``eos_token_id`` and ``pad_token_id``.
        tokenizer: Callable tokenizer whose result has ``input_ids``.
        params: Request dict. ``"prompt"`` is required; ``repetition_penalty``,
            ``max_new_tokens`` and ``echo`` are optional.
        device, context_len, stream_interval, judge_sent_end: Not read by
            this function.

    Yields:
        dict: ``{"text", "usage", "finish_reason"}``. ``completion_tokens``
        is the index of the latest streamed chunk.
    """
    prompt = params["prompt"]
    repetition_penalty = float(params.get("repetition_penalty", 1.0))

    # "temperature" and "top_p" are not read yet; reserved for future use.

    max_new_tokens = int(params.get("max_new_tokens", 4096))
    echo = params.get("echo", True)

    encoded = tokenizer(prompt, return_tensors="pt", padding=model.config.padding)
    prompt_ids = encoded.input_ids
    input_echo_len = len(prompt_ids[0])

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # NOTE(review): the request's repetition_penalty is forwarded as
    # "length_penalty" -- confirm this is what the backend expects.
    generation_kwargs = dict(
        input_ids=prompt_ids,
        streamer=streamer,
        max_length=max_new_tokens + input_echo_len,
        num_beams=model.config.beam_width,
        length_penalty=repetition_penalty,
        num_return_sequences=model.config.num_return_sequences,
        early_stopping=model.config.early_stopping,
        eos_token_id=model.config.eos_token_id,
        pad_token_id=model.config.pad_token_id,
    )

    worker = Thread(target=model.model.generate, kwargs=generation_kwargs)
    worker.start()

    def _event(text, chunk_index, reason):
        # One stream event; usage is derived from the latest chunk index.
        return {
            "text": text,
            "usage": {
                "prompt_tokens": input_echo_len,
                "completion_tokens": chunk_index,
                "total_tokens": input_echo_len + chunk_index,
            },
            "finish_reason": reason,
        }

    # With echo enabled the prompt is kept at the front of the output.
    output = prompt if echo else ""

    last_index = 0
    for last_index, new_text in enumerate(streamer):
        output += new_text
        yield _event(output, last_index, None)

    output = output.strip()
    finish_reason = "length" if last_index == max_new_tokens - 1 else "stop"
    yield _event(output, last_index, finish_reason)
    gc.collect()
src/model/model_yuan2.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from threading import Thread
3
+ from typing import Iterable
4
+
5
+ import torch
6
+ import transformers
7
+ from transformers import TextIteratorStreamer, GenerationConfig
8
+
9
+ from fastchat.utils import is_partial_stop
10
+
11
+
12
@torch.inference_mode()
def generate_stream_yuan2(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    """Stream text generated by a Yuan2 model as incremental result dicts.

    Generation runs in a background thread and is consumed through a
    ``TextIteratorStreamer``. Every ``stream_interval``-th chunk a dict with
    the accumulated text and usage counters is yielded (unless the text
    currently ends with a partial stop sequence); a final dict carrying the
    ``finish_reason`` is yielded once the streamer is exhausted.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        params: Request dict. ``"prompt"`` is required; optional keys are
            ``temperature``, ``repetition_penalty``, ``top_p``, ``top_k``,
            ``max_new_tokens``, ``stop``, ``echo`` and ``stop_token_ids``.
        device: Device name; only consulted for the final cache cleanup.
        context_len: Context budget used to left-truncate the prompt.
        stream_interval: Emit an intermediate result every this many chunks.
        judge_sent_end: Accepted for signature compatibility; unused here.

    Yields:
        Dicts with ``"text"``, ``"usage"`` and ``"finish_reason"`` keys.

    Raises:
        ValueError: If ``params["stop"]`` is neither a string nor an iterable.
    """
    prompt = params["prompt"]
    len_prompt = len(prompt)
    temperature = float(params.get("temperature", 1))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 0))
    top_k = int(params.get("top_k", 1))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 512))
    stop_str = params.get("stop", "<eod>")
    echo = bool(params.get("echo", True))
    # Copy before appending so the list owned by the caller's `params` dict
    # does not grow by one entry on every request.
    # NOTE(review): these ids are collected but not forwarded to generate().
    stop_token_ids = list(params.get("stop_token_ids", None) or [])
    stop_token_ids.append(tokenizer("<eod>")["input_ids"][0])

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    max_src_len = context_len - max_new_tokens - 8

    # The tensors are (batch, seq_len): truncate from the left along the
    # sequence axis, not the batch axis. A non-positive budget would slice
    # the prompt away entirely, so truncation is skipped in that case.
    if max_src_len > 0:
        input_ids = input_ids[:, -max_src_len:]
        attention_mask = attention_mask[:, -max_src_len:]
    input_echo_len = input_ids.shape[1]

    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)

    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature >= 1.2,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=10,
        top_p=top_p,
        top_k=top_k,
    )

    generation_kwargs = dict(
        inputs=input_ids,
        attention_mask=attention_mask,
        streamer=streamer,
        generation_config=generation_config,
    )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # With echo the prompt is kept in the output, and stop strings are only
    # searched for after it.
    if echo:
        output = prompt
        rfind_start = len_prompt
    else:
        output = ""
        rfind_start = 0

    # Defaults keep the final event well-defined when nothing is streamed.
    i = 0
    partially_stopped = False
    for i, new_text in enumerate(streamer):
        output += new_text
        if i % stream_interval == 0:
            partially_stopped = False
            if stop_str:
                if isinstance(stop_str, str):
                    pos = output.rfind(stop_str, rfind_start)
                    if pos != -1:
                        output = output[:pos]
                    else:
                        partially_stopped = is_partial_stop(output, stop_str)
                elif isinstance(stop_str, Iterable):
                    for each_stop in stop_str:
                        pos = output.rfind(each_stop, rfind_start)
                        if pos != -1:
                            output = output[:pos]
                            break
                        else:
                            partially_stopped = is_partial_stop(output, each_stop)
                            if partially_stopped:
                                break
                else:
                    raise ValueError("Invalid stop field type.")

            # prevent yielding partial stop sequence
            if not partially_stopped:
                yield {
                    "text": output,
                    "usage": {
                        "prompt_tokens": input_echo_len,
                        "completion_tokens": i,
                        "total_tokens": input_echo_len + i,
                    },
                    "finish_reason": None,
                }
    output = output.strip()

    # finish stream event, which contains finish reason
    if i == max_new_tokens - 1:
        finish_reason = "length"
    elif partially_stopped:
        finish_reason = None
    else:
        finish_reason = "stop"

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        },
        "finish_reason": finish_reason,
    }

    # clean
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
src/model/monkey_patch_non_inplace.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Monkey patch the llama implementation in the huggingface/transformers library.
3
+ Avoid bugs in mps backend by not using in-place operations.
4
+ """
5
+ import math
6
+ from typing import List, Optional, Tuple
7
+
8
+ import torch
9
+ from torch import nn
10
+ import transformers
11
+
12
+
13
def rotate_half(x):
    """Return ``x`` with the halves of its last dim swapped and the moved-up half negated."""
    half = x.shape[-1] // 2
    # Slices are cloned so the result never aliases the input storage.
    leading = x[..., :half].clone()
    trailing = x[..., half:].clone()
    return torch.cat((-trailing, leading), dim=-1)
18
+
19
+
20
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embeddings to the query and key tensors.

    The ``cos``/``sin`` tables are gathered along dim 2 at ``position_ids``
    and then combined with ``q``/``k`` and their half-rotated counterparts.

    Returns:
        Tuple ``(q_embed, k_embed)``.
    """
    # [bs, seq_len] -> [bs, 1, seq_len, 1], then expanded to the table's
    # dim-1 and dim-3 extents so it can index the table with gather().
    index = position_ids[:, None, :, None].repeat(1, cos.shape[1], 1, cos.shape[3])
    batch = index.shape[0]
    cos = torch.gather(cos.repeat(batch, 1, 1, 1), 2, index)
    sin = torch.gather(sin.repeat(batch, 1, 1, 1), 2, index)
    q_embed = q * cos + rotate_half(q) * sin
    k_embed = k * cos + rotate_half(k) * sin
    return q_embed, k_embed
28
+
29
+
30
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Attention forward pass written without in-place tensor operations.

    Installed as ``LlamaAttention.forward`` by
    ``replace_llama_attn_with_non_inplace_operations``.

    Args:
        hidden_states: Input of size ``(bsz, q_len, hidden)``.
        attention_mask: Optional additive mask of size
            ``(bsz, 1, q_len, kv_seq_len)``.
        position_ids: Positions used to index the rotary tables.
        past_key_value: Optional cached ``(key, value)`` pair to prepend.
        output_attentions: Whether to return the attention weights.
        use_cache: Whether to return the updated ``(key, value)`` pair.
        padding_mask: Accepted for signature compatibility; unused here.

    Returns:
        ``(attn_output, attn_weights or None, past_key_value or None)``.

    Raises:
        ValueError: If the attention weights, mask, or output have an
            unexpected size.
    """
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )

    # Total key/value length includes any cached prefix.
    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
        self.head_dim
    )

    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
        # Report the same 4-D shape that the check above compares against.
        raise ValueError(
            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
            f" {attn_weights.size()}"
        )

    if attention_mask is not None:
        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
            )
        attn_weights = attn_weights + attention_mask
        # Clamp to the dtype's minimum so masked entries stay finite.
        attn_weights = torch.max(
            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
        )

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
        query_states.dtype
    )
    attn_output = torch.matmul(attn_weights, value_states)

    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )

    # [bsz, nh, q_len, hd] -> [bsz, q_len, hidden_size]
    attn_output = attn_output.transpose(1, 2)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value
115
+
116
+
117
def replace_llama_attn_with_non_inplace_operations():
    """Avoid bugs in mps backend by not using in-place operations."""
    # Swap the library's attention forward for this module's implementation.
    llama_attention_cls = transformers.models.llama.modeling_llama.LlamaAttention
    llama_attention_cls.forward = forward
src/model/rwkv_model.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from types import SimpleNamespace
3
+ import warnings
4
+
5
+ import torch
6
+
7
+ os.environ["RWKV_JIT_ON"] = "1"
8
+ os.environ["RWKV_CUDA_ON"] = "1"
9
+
10
+ from rwkv.model import RWKV
11
+ from rwkv.utils import PIPELINE, PIPELINE_ARGS
12
+
13
+
14
class RwkvModel:
    """Thin adapter exposing an RWKV model through a call/generate interface."""

    def __init__(self, model_path):
        """Load the RWKV weights at ``model_path`` with the "cuda fp16" strategy."""
        warnings.warn(
            "Experimental support. Please use ChatRWKV if you want to chat with RWKV"
        )
        self.config = SimpleNamespace(is_encoder_decoder=False)
        # For two GPUs a strategy such as
        # "cuda:0 fp16 *20 -> cuda:1 fp16" could be used instead.
        self.model = RWKV(model=model_path, strategy="cuda fp16")

        # The tokenizer is created lazily by generate().
        self.tokenizer = None
        self.model_path = model_path

    def to(self, target):
        """Accept only the "cuda" target; the model is not moved."""
        assert target == "cuda"

    def __call__(self, input_ids, use_cache, past_key_values=None):
        """Run one forward step and return logits plus the recurrent state.

        Only the first row of ``input_ids`` is used. The result is a
        namespace with ``logits`` (two leading singleton dims added) and
        ``past_key_values`` (the state returned by the model).
        """
        assert use_cache == True
        token_array = input_ids[0].detach().cpu().numpy()
        raw_logits, new_state = self.model.forward(token_array, past_key_values)
        return SimpleNamespace(
            logits=raw_logits.unsqueeze(0).unsqueeze(0),
            past_key_values=new_state,
        )

    def generate(
        self, input_ids, do_sample, temperature, max_new_tokens, repetition_penalty=1.0
    ):
        """Generate a completion and return ``[prompt_ids + output_ids]``.

        ``do_sample`` is accepted but not forwarded to the generation
        parameters.
        """
        # This function is used by fastchat.llm_judge.
        # Because RWKV does not support huggingface generation API,
        # we reuse fastchat.serve.inference.generate_stream as a workaround.
        from transformers import AutoTokenizer

        from fastchat.serve.inference import generate_stream
        from fastchat.conversation import get_conv_template

        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                "EleutherAI/pythia-160m", use_fast=True
            )
        prompt_ids = input_ids[0].tolist()
        prompt_text = self.tokenizer.decode(prompt_ids)
        template = get_conv_template("rwkv")

        request = {
            "model": self.model_path,
            "prompt": prompt_text,
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens,
            "stop": template.stop_str,
            "stop_token_ids": template.stop_token_ids,
            "echo": False,
        }
        stream = generate_stream(self, self.tokenizer, request, "cuda")

        # Drain the stream; only the last yielded result is needed.
        for last_result in stream:
            pass

        completion_ids = self.tokenizer.encode(last_result["text"])

        return [input_ids[0].tolist() + completion_ids]
src/model/upload_hub.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload weights to huggingface.
3
+
4
+ Usage:
5
+ python3 -m fastchat.model.upload_hub --model-path ~/model_weights/vicuna-13b --hub-repo-id lmsys/vicuna-13b-v1.3
6
+ """
7
+ import argparse
8
+ import tempfile
9
+
10
+ import torch
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+
13
+
14
def upload_hub(model_path, hub_repo_id, component, private):
    """Push model weights and/or tokenizer files to a Hugging Face Hub repo.

    Args:
        model_path: Location passed to ``from_pretrained``.
        hub_repo_id: Target repository id on the Hub.
        component: ``"all"`` uploads both the model and the tokenizer; any
            other value is treated as the single component to upload.
        private: Forwarded as the ``private`` flag of the upload.
    """
    components = ["model", "tokenizer"] if component == "all" else [component]

    # Use the `private` parameter rather than the CLI-only global `args`,
    # so the function also works when imported and called directly.
    kwargs = {"push_to_hub": True, "repo_id": hub_repo_id, "private": private}

    if "model" in components:
        model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
        )
        # save_pretrained needs a local directory even when pushing to the hub.
        with tempfile.TemporaryDirectory() as tmp_path:
            model.save_pretrained(tmp_path, **kwargs)

    if "tokenizer" in components:
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
        with tempfile.TemporaryDirectory() as tmp_path:
            tokenizer.save_pretrained(tmp_path, **kwargs)
33
+
34
+
35
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the upload.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str, required=True)
    parser.add_argument(
        "--component",
        type=str,
        choices=["all", "model", "tokenizer"],
        default="all",
    )
    parser.add_argument("--private", action="store_true")
    args = parser.parse_args()

    upload_hub(
        model_path=args.model_path,
        hub_repo_id=args.hub_repo_id,
        component=args.component,
        private=args.private,
    )
src/modules/__init__.py ADDED
File without changes
src/modules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (190 Bytes). View file
 
src/modules/__pycache__/awq.cpython-310.pyc ADDED
Binary file (2.87 kB). View file
 
src/modules/__pycache__/exllama.cpython-310.pyc ADDED
Binary file (1.81 kB). View file
 
src/modules/__pycache__/gptq.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
src/modules/__pycache__/xfastertransformer.cpython-310.pyc ADDED
Binary file (1.78 kB). View file
 
src/modules/awq.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ import sys
4
+
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, modeling_utils
7
+
8
+
9
@dataclass
class AWQConfig:
    """Settings describing an AWQ-quantized checkpoint to load."""

    # Path to the local AWQ checkpoint (a file, or a directory to search).
    ckpt: str = field(
        default=None,
        metadata={"help": "Load quantized model. The path to the local AWQ checkpoint."},
    )
    # Quantization bit width.
    wbits: int = field(
        default=16,
        metadata={"help": "#bits to use for quantization"},
    )
    # Quantization group size; -1 means the full row.
    groupsize: int = field(
        default=-1,
        metadata={"help": "Groupsize to use for quantization; default uses full row."},
    )
22
+
23
+
24
def load_awq_quantized(model_name, awq_config: AWQConfig, device):
    """Build a model from its config and load AWQ-quantized weights into it.

    Args:
        model_name: Name or path used to load the config and tokenizer.
        awq_config: Checkpoint location, bit width and group size.
        device: Device forwarded to the tinychat loaders.

    Returns:
        Tuple ``(model, tokenizer)``.

    Note:
        Exits the process if tinychat cannot be imported or no checkpoint is
        found. This function also replaces several ``torch.nn.init``
        functions with no-ops and sets the default dtype to half; neither
        change is undone afterwards.
    """
    print("Loading AWQ quantized model...")

    try:
        from tinychat.utils import load_quant
        from tinychat.modules import make_quant_norm, make_quant_attn, make_fused_mlp
    except ImportError as e:
        print(f"Error: Failed to import tinychat. {e}")
        print("Please double check if you have successfully installed AWQ")
        print("See https://github.com/lm-sys/FastChat/blob/main/docs/awq.md")
        sys.exit(-1)

    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=False, trust_remote_code=True
    )

    def skip(*args, **kwargs):
        pass

    # Weights are overwritten by the checkpoint, so random init is skipped.
    torch.nn.init.kaiming_uniform_ = skip
    torch.nn.init.kaiming_normal_ = skip
    torch.nn.init.uniform_ = skip
    torch.nn.init.normal_ = skip
    modeling_utils._init_weights = False

    torch.set_default_dtype(torch.half)
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

    # Resolve the checkpoint once instead of re-globbing for every use.
    ckpt_path = find_awq_ckpt(awq_config)

    if any(name in ckpt_path for name in ("llama", "vicuna")):
        model = load_quant.load_awq_llama_fast(
            model,
            ckpt_path,
            awq_config.wbits,
            awq_config.groupsize,
            device,
        )
        make_quant_attn(model, device)
        make_quant_norm(model)
        make_fused_mlp(model)
    else:
        model = load_quant.load_awq_model(
            model,
            ckpt_path,
            awq_config.wbits,
            awq_config.groupsize,
            device,
        )
    return model, tokenizer
73
+
74
+
75
def find_awq_ckpt(awq_config: AWQConfig):
    """Resolve ``awq_config.ckpt`` to a concrete checkpoint file path.

    If ``ckpt`` is a file it is returned unchanged. Otherwise it is searched
    as a directory for ``*.pt`` and then ``*.safetensors`` files, and the
    last match in sorted order is returned as a string. When nothing is
    found an error is printed and the process exits with status 1.
    """
    ckpt_location = Path(awq_config.ckpt)
    if ckpt_location.is_file():
        return awq_config.ckpt

    # ".pt" files take precedence over ".safetensors".
    for pattern in ("*.pt", "*.safetensors"):
        candidates = sorted(ckpt_location.glob(pattern))
        if candidates:
            return str(candidates[-1])

    print("Error: AWQ checkpoint not found")
    sys.exit(1)