File size: 7,453 Bytes

74936cb

#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#

"""Module to generate OpenELM output given a model and an input prompt."""
import argparse
import logging
import os
import shutil
import time
from typing import Optional, Tuple, Union

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def generate(
    prompt: str,
    model: Union[str, AutoModelForCausalLM],
    hf_access_token: Optional[str] = None,
    tokenizer: Union[str, AutoTokenizer] = 'meta-llama/Llama-2-7b-hf',
    device: Optional[str] = None,
    max_length: int = 1024,
    assistant_model: Optional[Union[str, AutoModelForCausalLM]] = None,
    generate_kwargs: Optional[dict] = None,
) -> Tuple[str, float]:
    """Generate output text for a prompt and measure the generation time.

    Args:
        prompt: The string prompt.
        model: The LLM model. If a string is passed, it is handed to
            ``AutoModelForCausalLM.from_pretrained`` (e.g. the path to the
            hf converted checkpoint).
        hf_access_token: Hugging face access token. Only needed when
            ``tokenizer`` is a string, because it is then passed to
            ``AutoTokenizer.from_pretrained``.
        tokenizer: Tokenizer instance, or a string that is handed to
            ``AutoTokenizer.from_pretrained``.
        device: String representation of device to run the model on. If None
            and cuda available it would be set to cuda:0 else cpu.
        max_length: Maximum length of tokens, input prompt + generated tokens.
        assistant_model: If set, this model will be used for
            speculative generation. If a string is passed, it is handed to
            ``AutoModelForCausalLM.from_pretrained``.
        generate_kwargs: Extra kwargs passed to the hf generate function.
            Entries here take precedence over the defaults this function
            sets (``max_length``, ``pad_token_id``, ``assistant_model``).

    Returns:
        A tuple ``(output_text, generation_time)``:
        output_text: decoded output of the generate call as a string.
        generation_time: generation time in seconds.

    Raises:
        ValueError: If device is set to CUDA but no CUDA device is detected.
        ValueError: If tokenizer is not set.
        ValueError: If tokenizer is a string and hf_access_token is not
            specified.
    """
    if not device:
        if torch.cuda.is_available() and torch.cuda.device_count():
            device = "cuda:0"
            logging.warning(
                'inference device is not set, using cuda:0, %s',
                torch.cuda.get_device_name(0)
            )
        else:
            device = 'cpu'
            logging.warning(
                (
                    'No CUDA device detected, using cpu, '
                    'expect slower speeds.'
                )
            )

    if 'cuda' in device and not torch.cuda.is_available():
        raise ValueError('CUDA device requested but no CUDA device detected.')

    if not tokenizer:
        raise ValueError('Tokenizer is not set in the generate function.')

    # The token is only consumed when the tokenizer has to be loaded by name;
    # an already constructed tokenizer instance does not need it.
    if isinstance(tokenizer, str) and not hf_access_token:
        raise ValueError((
            'Hugging face access token needs to be specified. '
            'Please refer to https://huggingface.co/docs/hub/security-tokens'
            ' to obtain one.'
            )
        )

    # All argument validation happens above so that bad input fails before
    # any (potentially slow) model loading.
    if isinstance(model, str):
        checkpoint_path = model
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint_path,
            trust_remote_code=True
        )
    model.to(device).eval()
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer,
            token=hf_access_token,
        )

    # Speculative mode
    draft_model = None
    if assistant_model:
        draft_model = assistant_model
        if isinstance(assistant_model, str):
            draft_model = AutoModelForCausalLM.from_pretrained(
                assistant_model,
                trust_remote_code=True
            )
        draft_model.to(device).eval()

    # Prepare the prompt: token ids as a tensor with a leading batch
    # dimension of size one.
    tokenized_prompt = torch.tensor(
        tokenizer(prompt)['input_ids'],
        device=device
    ).unsqueeze(0)

    # Merge the defaults with the user supplied kwargs instead of passing
    # both as keywords: passing e.g. pad_token_id twice would raise a
    # TypeError. User supplied values win.
    call_kwargs = {
        'max_length': max_length,
        'pad_token_id': 0,
        'assistant_model': draft_model,
    }
    if generate_kwargs:
        call_kwargs.update(generate_kwargs)

    # Generate. perf_counter is used because it is meant for measuring
    # elapsed intervals and is not affected by system clock updates.
    stime = time.perf_counter()
    output_ids = model.generate(tokenized_prompt, **call_kwargs)
    generation_time = time.perf_counter() - stime

    output_text = tokenizer.decode(
        output_ids[0].tolist(),
        skip_special_tokens=True
    )

    return output_text, generation_time


def openelm_generate_parser():
    """Build the command line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()`` with the
        attributes ``model``, ``hf_access_token``, ``prompt``, ``device``,
        ``max_length``, ``assistant_model`` and ``generate_kwargs``.
        ``generate_kwargs`` is a dict when the option is given, else None.

    Raises:
        ValueError: If a ``--generate_kwargs`` entry is not of the form
            key=value.
    """

    def _convert_value(raw):
        """Convert a raw string to int, float or bool, else keep the string."""
        for cast in (int, float):
            try:
                return cast(raw)
            except ValueError:
                continue
        # Without this, 'do_sample=False' would stay the non-empty (truthy)
        # string 'False'.
        lowered = raw.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        return raw

    class KwargsParser(argparse.Action):
        """Parser action class to parse kwargs of form key=value"""
        def __call__(self, parser, namespace, values, option_string=None):
            parsed = {}
            for val in values:
                # Split on the first '=' only, so that values which contain
                # '=' themselves are kept intact.
                kwarg_k, separator, kwarg_v = val.partition('=')
                if not separator or not kwarg_k:
                    raise ValueError(
                        (
                            'Argument parsing error, kwargs are expected in'
                            ' the form of key=value.'
                        )
                    )
                parsed[kwarg_k] = _convert_value(kwarg_v)
            setattr(namespace, self.dest, parsed)

    parser = argparse.ArgumentParser('OpenELM Generate Module')
    parser.add_argument(
        '--model',
        dest='model',
        help='Path to the hf converted model.',
        required=True,
        type=str,
    )
    parser.add_argument(
        '--hf_access_token',
        dest='hf_access_token',
        help='Hugging face access token, starting with "hf_".',
        type=str,
    )
    parser.add_argument(
        '--prompt',
        dest='prompt',
        help='Prompt for LLM call.',
        default='',
        type=str,
    )
    parser.add_argument(
        '--device',
        dest='device',
        help='Device used for inference.',
        type=str,
    )
    parser.add_argument(
        '--max_length',
        dest='max_length',
        help='Maximum length of tokens.',
        default=256,
        type=int,
    )
    parser.add_argument(
        '--assistant_model',
        dest='assistant_model',
        help=(
            (
                'If set, this is used as a draft model '
                'for assisted speculative generation.'
            )
        ),
        type=str,
    )
    parser.add_argument(
        '--generate_kwargs',
        dest='generate_kwargs',
        help='Additional kwargs passed to the HF generate function.',
        type=str,
        nargs='*',
        action=KwargsParser,
    )
    return parser.parse_args()


# Script entry point: parse the command line, run generation and print the
# prompt + generated output together with the time the generation took.
if __name__ == '__main__':
    args = openelm_generate_parser()

    output_text, generation_time = generate(
        prompt=args.prompt,
        model=args.model,
        device=args.device,
        max_length=args.max_length,
        assistant_model=args.assistant_model,
        generate_kwargs=args.generate_kwargs,
        hf_access_token=args.hf_access_token,
    )

    # shutil.get_terminal_size falls back to a default width when stdout is
    # not attached to a terminal (e.g. output piped to a file), whereas
    # os.get_terminal_size raises OSError in that case.
    columns = shutil.get_terminal_size().columns

    print_txt = (
        f'\r\n{"=" * columns}\r\n'
        '\033[1m Prompt + Generated Output\033[0m\r\n'
        f'{"-" * columns}\r\n'
        f'{output_text}\r\n'
        f'{"-" * columns}\r\n'
        '\r\nGeneration took'
        f'\033[1m\033[92m {round(generation_time, 2)} \033[0m'
        'seconds.\r\n'
    )
    print(print_txt)