import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

if torch.cuda.is_available():
    print("CUDA is available")

# base_model_id = "microsoft/phi-2"
# base_model_id = "abacaj/phi-2-super"
base_model_id = "./results"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("pad_token was missing and has been set to eos_token")

# Phi-2 style "Instruct:/Output:" chat template.
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ bos_token + 'Instruct: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'Output: ' + message['content'] + eos_token }}{% endif %}{% endfor %}"

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
).to("cuda")
print(model)

# Test conversations: the first pairs the system-style instruction with the
# expected assistant acknowledgement; the rest are single participant questions.
meta_messages = [
    [
        {
            "role": "user",
            "content": "You are an AI assistant that will be answering phone calls from participants of the Nowhere event, a regional Burning Man event in Spain. The phone the participant is using will be on-site at or near the Oasis Playground barrio. Your answers will be short and to the point. Conversation with the participant will be solely through voice prompts, with the use of speech-to-text and text-to-speech software. You as the assistant will provide in your answers the correct hippie-like vibe for this type of event.",
        },
        {
            "role": "assistant",
            "content": "Of course, hippie, I will try my best for you!",
        },
    ],
    [{"role": "user", "content": "Hello, who are you?"}],
    [{"role": "user", "content": "Where are we?"}],
    [{"role": "user", "content": "What can I do here?"}],
    [{"role": "user", "content": "It is so hot I am getting a headache!"}],
    [{"role": "user", "content": "How do I use the toilets?"}],
    [{"role": "user", "content": "What is a Nobody?"}],
]

with torch.no_grad():
    for messages in meta_messages:
        for msg in messages:
            print(f"{msg['role']}: {msg['content']}")

        # Only append a generation prompt when the conversation does not
        # already end with an assistant message.
        add_generation_prompt = True
        if len(messages) == 2:
            add_generation_prompt = False

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=add_generation_prompt,
            return_tensors="pt",
        ).to(model.device)
        input_ids_cutoff = inputs.size(dim=1)

        start_time = time.time()
        generated_ids = model.generate(
            input_ids=inputs,
            use_cache=True,
            max_new_tokens=512,
            temperature=0.2,
            top_p=0.95,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
        duration = time.time() - start_time

        # Decode only the newly generated tokens, not the prompt.
        generated = generated_ids[0][input_ids_cutoff:]
        completion = tokenizer.decode(generated, skip_special_tokens=True)
        print(
            f"assistant: {completion} | {len(generated)} tokens, "
            f"{round(len(generated) / duration, 3)} tokens/sec"
        )