Text Generation · Transformers · Safetensors · openelm · custom_code
mahyar-najibi committed
Commit 771d259
1 Parent(s): 1186dc1

Updating generate_openelm.py and README.

Files changed (2)
  1. README.md +6 -5
  2. generate_openelm.py +38 -42
README.md CHANGED
@@ -20,16 +20,17 @@ We have provided an example function to generate output from OpenELM models load
 
 You can try the model by running the following command:
 ```
-python generate_openelm.py --checkpoint apple/OpenELM-3B --hf_security_token [HF_SECURITY_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs no_repeat_ngram_size=10
+python generate_openelm.py --model apple/OpenELM-3B --hf_access_token [HF_ACCESS_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs repetition_penalty=1.2
 ```
+Please refer to [this link](https://huggingface.co/docs/hub/security-tokens) to obtain your Hugging Face access token.
 
-Additional arguments to the huggingface generate function can be passed via `generate_kwargs`. As an example, to speedup the inference, you can try [lookup token speculative generation](https://huggingface.co/docs/transformers/generation_strategies) by passing the `prompt_lookup_num_tokens` argument as follows:
+Additional arguments to the Hugging Face generate function can be passed via `generate_kwargs`. As an example, to speed up inference, you can try [lookup token speculative generation](https://huggingface.co/docs/transformers/generation_strategies) by passing the `prompt_lookup_num_tokens` argument as follows:
 ```
-python generate_openelm.py --checkpoint apple/OpenELM-3B --hf_security_token [HF_SECURITY_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs no_repeat_ngram_size=10 prompt_lookup_num_tokens=10
+python generate_openelm.py --model apple/OpenELM-3B --hf_access_token [HF_ACCESS_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs repetition_penalty=1.2 prompt_lookup_num_tokens=10
 ```
-Alternatively, model-wise speculative generation can be also tried by passing a smaller model checkpoint through the `speculative_model_ckpt` argument, for example:
+Alternatively, model-wise speculative generation with an [assistive model](https://huggingface.co/blog/assisted-generation) can also be tried by passing a smaller model through the `assistant_model` argument, for example:
 ```
-python generate_openelm.py --checkpoint apple/OpenELM-3B --hf_security_token [HF_SECURITY_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs no_repeat_ngram_size=10 --speculative_model_ckpt apple/OpenELM-270M
+python generate_openelm.py --model apple/OpenELM-3B --hf_access_token [HF_ACCESS_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs repetition_penalty=1.2 --assistant_model apple/OpenELM-270M
 ```
 
 
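The README only exercises the command-line entry point. The same flow can be driven from Python by importing `generate()` from generate_openelm.py, whose updated signature appears in the diff below. This is a minimal, untested sketch based on that signature and on the call at the bottom of the script; the token string, model ids, and kwargs are placeholders:

```python
# Minimal sketch (untested): calling the updated generate() directly from Python.
# Assumes generate_openelm.py is importable; token and model ids are placeholders.
from generate_openelm import generate

output_text, generation_time = generate(
    prompt='Once upon a time there was',
    model='apple/OpenELM-3B',              # hf model id or path to a converted checkpoint
    hf_access_token='hf_...',              # placeholder; see the security-tokens docs linked above
    assistant_model='apple/OpenELM-270M',  # optional draft model for assisted generation
    generate_kwargs={'repetition_penalty': 1.2},
)
print(output_text)
```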
generate_openelm.py CHANGED
@@ -12,11 +12,11 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 def generate(
     prompt: str,
     model: Union[str, AutoModelForCausalLM],
-    hf_security_token: str = None,
+    hf_access_token: str = None,
     tokenizer: Union[str, AutoTokenizer] = 'meta-llama/Llama-2-7b-hf',
     device: Optional[str] = None,
     max_length: int = 1024,
-    speculative_model: Optional[Union[str, AutoModelForCausalLM]] = None,
+    assistant_model: Optional[Union[str, AutoModelForCausalLM]] = None,
     generate_kwargs: Optional[dict] = None,
 ) -> str:
     """ Generates output given a prompt.
@@ -25,16 +25,16 @@ def generate(
         prompt: The string prompt.
         model: The LLM Model. If a string is passed, it should be the path to
             the hf converted checkpoint.
-        hf_security_token: Hugging face security token.
+        hf_access_token: Hugging face access token.
         tokenizer: Tokenizer instance. If model is set as a string path,
             the tokenizer will be loaded from the checkpoint.
         device: String representation of device to run the model on. If None
             and cuda available it would be set to cuda:0 else cpu.
         max_length: Maximum length of tokens, input prompt + generated tokens.
-        speculative_model: If set, this model will be used for
+        assistant_model: If set, this model will be used for
             speculative generation. If a string is passed, it should be the
             path to the hf converted checkpoint.
-        generate_kwargs: Extra kwargs passed to the generate function.
+        generate_kwargs: Extra kwargs passed to the hf generate function.
 
     Returns:
         output_text: output generated as a string.
@@ -42,9 +42,8 @@ def generate(
 
     Raises:
         ValueError: If device is set to CUDA but no CUDA device is detected.
-        FileNotFoundError: If model or speculative_model are strings but
-            the model paths do not exist.
-        ValueError: If hf_security_token is not specified.
+        ValueError: If tokenizer is not set.
+        ValueError: If hf_access_token is not specified.
     """
     if not device:
         if torch.cuda.is_available() and torch.cuda.device_count():
@@ -55,28 +54,22 @@
         )
     else:
         device = 'cpu'
-        logging.warning('no CUDA device detected, using cpu, expect slower speeds.')
+        logging.warning(
+            (
+                'No CUDA device detected, using cpu, '
+                'expect slower speeds.'
+            )
+        )
 
     if 'cuda' in device and not torch.cuda.is_available():
         raise ValueError('CUDA device requested but no CUDA device detected.')
 
-    if isinstance(model, str) and (not model or not os.path.exists(model)):
-        raise FileNotFoundError(f'Model checkpoint does not exist at {model}.')
-
-    if (isinstance(speculative_model, str) and (
-            not speculative_model and not os.path.exists(speculative_model))):
-        raise FileNotFoundError(
-            (
-                'Speculative checkpoint path does not exist at '
-                f'{speculative_model}.'
-            )
-        )
-    if not tokenizer and not isinstance(model, str):
+    if not tokenizer:
         raise ValueError('Tokenizer is not set in the generate function.')
 
-    if not hf_security_token:
+    if not hf_access_token:
         raise ValueError((
-            'Hugging face security key needs to be specified. '
+            'Hugging face access token needs to be specified. '
             'Please refer to https://huggingface.co/docs/hub/security-tokens'
             ' to obtain one.'
         )
@@ -92,16 +85,16 @@ def generate(
     if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer,
-            token=hf_security_token,
+            token=hf_access_token,
         )
 
     # Speculative mode
     draft_model = None
-    if speculative_model:
-        draft_model = speculative_model
-        if isinstance(speculative_model, str):
+    if assistant_model:
+        draft_model = assistant_model
+        if isinstance(assistant_model, str):
             draft_model = AutoModelForCausalLM.from_pretrained(
-                speculative_model,
+                assistant_model,
                 trust_remote_code=True
             )
             draft_model.to(device).eval()
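The hunk above only loads the draft model; the `generate` call that consumes it sits outside this diff. For context, in the Transformers API assisted (speculative) generation is requested by passing the draft model through the `assistant_model` keyword of `generate()`. The sketch below reuses the variable names from the function but is an illustration, not the script's actual call:

```python
# Illustrative sketch: how a loaded draft_model is typically consumed downstream.
# Passing assistant_model=None simply disables assisted generation.
inputs = tokenizer(prompt, return_tensors='pt').to(device)
output_ids = model.generate(
    **inputs,
    max_length=max_length,
    assistant_model=draft_model,
    **(generate_kwargs or {}),
)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
```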
@@ -161,22 +154,22 @@ def openelm_generate_parser():
 
     parser = argparse.ArgumentParser('OpenELM Generate Module')
     parser.add_argument(
-        '--checkpoint',
-        dest='checkpoint_path',
-        help='Path to the model hf converted checkpoint.',
+        '--model',
+        dest='model',
+        help='Path to the hf converted model.',
         required=True,
         type=str,
     )
     parser.add_argument(
-        '--hf_security_token',
-        dest='hf_security_token',
-        help='HF security token, starting with "hf_".',
+        '--hf_access_token',
+        dest='hf_access_token',
+        help='Hugging face access token, starting with "hf_".',
         type=str,
     )
     parser.add_argument(
         '--prompt',
         dest='prompt',
-        help='Prompt for LLM call. Ignored if demo is True.',
+        help='Prompt for LLM call.',
         default='',
         type=str,
     )
@@ -194,17 +187,20 @@ def openelm_generate_parser():
         type=int,
     )
     parser.add_argument(
-        '--speculative_model_ckpt',
-        dest='speculative_model_ckpt',
+        '--assistant_model',
+        dest='assistant_model',
         help=(
-            'If set, this is used as a draft model for speculative generation.'
+            (
+                'If set, this is used as a draft model '
+                'for assisted speculative generation.'
+            )
         ),
         type=str,
     )
     parser.add_argument(
         '--generate_kwargs',
         dest='generate_kwargs',
-        help='additional kwargs passed to the HF generate function.',
+        help='Additional kwargs passed to the HF generate function.',
         type=str,
         nargs='*',
         action=KwargsParser,
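`--generate_kwargs` relies on `action=KwargsParser`, which is defined elsewhere in generate_openelm.py and does not appear in this diff. Below is a minimal sketch of such an argparse action, assuming it simply turns space-separated `key=value` pairs into a dict; the real implementation may parse values differently:

```python
import argparse

class KwargsParser(argparse.Action):
    """Sketch of a key=value parser for --generate_kwargs; the actual
    implementation in generate_openelm.py is not shown in this diff."""

    def __call__(self, parser, namespace, values, option_string=None):
        kwargs = {}
        for pair in values:
            key, _, raw = pair.partition('=')
            # Best-effort typing: try int, then float, otherwise keep the string.
            for cast in (int, float):
                try:
                    value = cast(raw)
                    break
                except ValueError:
                    continue
            else:
                value = raw
            kwargs[key] = value
        setattr(namespace, self.dest, kwargs)
```

With something like this, `--generate_kwargs repetition_penalty=1.2 prompt_lookup_num_tokens=10` would arrive in `args.generate_kwargs` as `{'repetition_penalty': 1.2, 'prompt_lookup_num_tokens': 10}` and be forwarded to the Hugging Face `generate()` call.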
@@ -218,12 +214,12 @@ if __name__ == '__main__':
 
     output_text, genertaion_time = generate(
         prompt=prompt,
-        model=args.checkpoint_path,
+        model=args.model,
         device=args.device,
         max_length=args.max_length,
-        speculative_model=args.speculative_model_ckpt,
+        assistant_model=args.assistant_model,
         generate_kwargs=args.generate_kwargs,
-        hf_security_token=args.hf_security_token,
+        hf_access_token=args.hf_access_token,
     )
 
     print_txt = (