---
license: creativeml-openrail-m
---

This model is an ONNX export of [Pygmalion-6b](https://huggingface.co/PygmalionAI/pygmalion-6b); all credit goes to PygmalionAI.

Be warned: this ONNX export is not fully accurate, and the weights are upcast to float32 due to a limitation of the PyTorch ONNX exporter, so the exported model occupies twice the memory of the original Pygmalion model. The purpose of this export is to obtain the list of operators and nodes needed to eventually run inference on Pygmalion-6b with Vulkan Compute, which would enable no-BS inference with INT8 or INT4 quantization on almost any device that supports Vulkan Compute out of the box.

The scripts are listed below. model.py is taken from [PygmalionAI/gradio-ui](https://github.com/PygmalionAI/gradio-ui) and is licensed under the GNU Affero General Public License v3.0; in keeping with that license, all scripts listed below are also under the GNU Affero General Public License v3.0.


**export.py**
```py
import torch
import transformers

from model import build_model_and_tokenizer_for

model_name = "PygmalionAI/pygmalion-6b"

# Load the model and tokenizer, move the model to CPU, and upcast it to
# float32 (the PyTorch ONNX exporter cannot export the float16 weights as-is).
model, tokenizer = build_model_and_tokenizer_for(model_name)
model.to('cpu').float()

input_layer = model.get_input_embeddings()
output_layer = model.get_output_embeddings()

# Alternative: load the weights directly from a .pth checkpoint instead.
#model = transformers.AutoModelForCausalLM.from_pretrained("PygmalionAI/pygmalion-6b")
#state_dict = torch.load('pygmalion-6b.pth')
#model.load_state_dict(state_dict)

# Sanity-check the tokenizer by encoding some input text.
input_text = "Hello, how are you today?"
encoded_input = tokenizer.encode(input_text, return_tensors='pt')
print(f"Raw: {input_text}")
print(f"Encoded: {encoded_input}")

# Export the model to ONNX with dynamic batch and sequence-length axes.
output_path = "onnx/pygmalion-6b.onnx"
dummy_input = torch.zeros((1, 10), dtype=torch.long)
input_names = ["input_ids"]
output_names = ["output"]
dynamic_axes = {"input_ids": {0: "batch_size", 1: "sequence_length"},
                "output": {0: "batch_size", 1: "sequence_length"}}
torch.onnx.export(model, dummy_input, output_path, input_names=input_names,
                  output_names=output_names, dynamic_axes=dynamic_axes,
                  opset_version=12)
```
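
Since the stated purpose of this export is the operator/node inventory, that list can be pulled straight out of the exported graph. Below is a minimal sketch, not part of the original scripts; the file name `list_ops.py` is hypothetical, and it assumes the export above produced `onnx/pygmalion-6b.onnx`.

```py
# list_ops.py -- hypothetical helper, not from the original repo.
# Enumerates the distinct ONNX operator types in the exported graph; this is
# the node/operator inventory a Vulkan Compute backend would need to cover.
import collections

import onnx

# load_external_data=False skips the (very large) weight tensors; the graph
# structure alone is enough to list the operators.
model = onnx.load("onnx/pygmalion-6b.onnx", load_external_data=False)

op_counts = collections.Counter(node.op_type for node in model.graph.node)
for op_type, count in op_counts.most_common():
    print(f"{op_type}: {count}")
```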

**model.py**
```py
import logging
import typing as t

import torch
import transformers

logger = logging.getLogger(__name__)


def build_model_and_tokenizer_for(
    model_name: str
) -> t.Tuple[transformers.AutoModelForCausalLM, transformers.AutoTokenizer]:
    '''Sets up the model and accompanying objects.'''
    logger.info(f"Loading tokenizer for {model_name}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    # NOTE(11b): non-OPT models support passing this in at inference time; might
    # be worth refactoring for a debug version so we're able to experiment on
    # the fly.
    bad_words_ids = [
        tokenizer(bad_word, add_special_tokens=False).input_ids
        for bad_word in _build_bad_words_list_for(model_name)
    ]

    logger.info(f"Loading the {model_name} model")
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name, bad_words_ids=bad_words_ids)
    model.eval().to("cpu")

    logger.info("Model and tokenizer are ready")
    return model, tokenizer


def build_tokenizer_for(model_name: str) -> transformers.AutoTokenizer:
    '''Sets up the tokenizer on its own, without loading the model.'''
    logger.info(f"Loading tokenizer for {model_name}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    return tokenizer


def run_raw_inference(model: transformers.AutoModelForCausalLM,
                      tokenizer: transformers.AutoTokenizer, prompt: str,
                      user_message: str, **kwargs: t.Any) -> str:
    '''
    Runs inference on the model, and attempts to return only the newly
    generated text.

    :param model: Model to perform inference with.
    :param tokenizer: Tokenizer to tokenize input with.
    :param prompt: Input to feed to the model.
    :param user_message: The user's raw message, exactly as appended to the end
        of `prompt`. Used for trimming the original input from the model output.
    :return: Decoded model generation.
    '''
    tokenized_items = tokenizer(prompt, return_tensors="pt").to("cpu")

    # Atrocious code to stop generation when the model outputs "\nYou: " in
    # freshly generated text. Feel free to send in a PR if you know of a
    # cleaner way to do this.
    stopping_criteria_list = transformers.StoppingCriteriaList([
        _SentinelTokenStoppingCriteria(
            sentinel_token_ids=tokenizer(
                "\nYou:",
                add_special_tokens=False,
                return_tensors="pt",
            ).input_ids.to("cpu"),
            starting_idx=tokenized_items.input_ids.shape[-1])
    ])

    logits = model.generate(stopping_criteria=stopping_criteria_list,
                            **tokenized_items,
                            **kwargs)
    output = tokenizer.decode(logits[0], skip_special_tokens=True)

    logger.debug("Before trimming, model output was: `%s`", output)

    # Trim out the input prompt from the generated output.
    if (idx := prompt.rfind(user_message)) != -1:
        trimmed_output = output[idx + len(user_message) - 1:].strip()
        logger.debug("After trimming, it became: `%s`", trimmed_output)

        return trimmed_output
    else:
        raise Exception(
            "Couldn't find user message in the model's output. What?")


def _build_bad_words_list_for(_model_name: str) -> t.List[str]:
    '''Builds a list of bad words for the given model.'''

    # NOTE(11b): This was implemented as a function because each model size
    # seems to have its quirks at the moment, but this is a rushed
    # implementation so I'm not handling that, hence the dumb return here.
    return ["Persona:", "Scenario:", "<START>"]


# Stopping criterion used by run_raw_inference above: halts generation once
# the sentinel token sequence appears in the freshly generated portion.
class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):

    def __init__(self, sentinel_token_ids: torch.LongTensor,
                 starting_idx: int):
        transformers.StoppingCriteria.__init__(self)
        self.sentinel_token_ids = sentinel_token_ids
        self.starting_idx = starting_idx

    def __call__(self, input_ids: torch.LongTensor,
                 _scores: torch.FloatTensor) -> bool:
        for sample in input_ids:
            trimmed_sample = sample[self.starting_idx:]
            # Can't unfold, output is still too tiny. Skip.
            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
                continue

            for window in trimmed_sample.unfold(
                    0, self.sentinel_token_ids.shape[-1], 1):
                if torch.all(torch.eq(self.sentinel_token_ids, window)):
                    return True
        return False
```
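
For reference, `run_raw_inference` expects the user's message to appear verbatim at the end of the prompt, since it trims everything up to and including that message from the decoded output. Below is a hypothetical usage sketch; the prompt format and generation parameters are illustrative, not taken from the original repo.

```py
from model import build_model_and_tokenizer_for, run_raw_inference

model, tokenizer = build_model_and_tokenizer_for("PygmalionAI/pygmalion-6b")

# The user message must appear verbatim in the prompt so the trimming logic
# in run_raw_inference can locate it.
user_message = "You: Hello, how are you today?"
prompt = (f"Assistant's Persona: A friendly assistant.\n"
          f"<START>\n{user_message}\nAssistant:")

# Extra keyword arguments are forwarded to model.generate().
reply = run_raw_inference(model, tokenizer, prompt, user_message,
                          max_new_tokens=64, do_sample=True, top_p=0.9)
print(reply)
```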