File size: 3,247 Bytes
abd40c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from configuration_dolphin import DolphinConfig
from modeling_dolphin import DolphinForCausalLM
from transformers import (AutoTokenizer, AutoModelForCausalLM, AutoConfig)
import torch
def inference_instruct(mycontext, question, device="cuda:0", max_new_tokens=None):
    """Greedily decode an answer to ``question`` using ``mycontext`` as context.

    The function reads the module-level ``tokenizer`` and ``model`` globals,
    which must exist before it is called (this file creates them in its
    ``__main__`` section).

    Args:
        mycontext: Context text. It is tokenized together with ``MEMORY_SIZE``
            appended ``[memory_i]`` marker strings and passed to the model as
            ``context_input_ids`` / ``context_attention_mask``.
        question: Question text placed after the memory placeholders in the
            decoder prompt.
        device: Torch device string that the input tensors are moved to.
        max_new_tokens: Optional upper bound on the number of generated
            tokens. When ``None`` (the default) the bound is the number of
            context tokens, which is the original behaviour.

    Returns:
        The decoded text of the generated tokens. The end-of-sequence token is
        not included.

    Side effects:
        Prints the elapsed wall-clock time of the call.
    """
    import time

    MEMORY_SIZE = 32
    # Token id that stops generation.
    # NOTE(review): presumably the tokenizer's end-of-text id -- confirm
    # against the tokenizer's vocabulary before changing it.
    EOS_TOKEN_ID = 151643

    start_time = time.perf_counter()
    generated_token_ids = []

    # Split only at the template's own "<context>" marker. A plain
    # ``split("<context>")`` would cut the question short whenever the
    # question text itself contains that marker.
    prompt = f" <context>{question}"
    prefix_text, _, question_text = prompt.partition("<context>")
    prefix_ids = tokenizer(prefix_text).input_ids
    question_ids = tokenizer(question_text).input_ids

    # MEMORY_SIZE entries of -1 sit between the prefix and the question.
    # NOTE(review): -1 is not a valid vocabulary id; presumably the model
    # swaps these slots for its context-memory embeddings -- confirm in
    # modeling_dolphin.
    input_ids = (
        torch.tensor(
            prefix_ids + [-1] * MEMORY_SIZE + question_ids, dtype=torch.long
        )
        .unsqueeze(0)
        .to(device)
    )

    # to process the context
    context_tokenized = tokenizer(
        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
        return_tensors="pt",
    )
    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE

    step_limit = context_token_count if max_new_tokens is None else max_new_tokens

    # Greedy decoding. Gradients are never needed here, so autograd tracking
    # is disabled to avoid retaining activations for every forward pass.
    with torch.no_grad():
        for _ in range(step_limit):
            logits = model(
                input_ids,
                context_input_ids=context_tokenized["input_ids"],
                context_attention_mask=context_tokenized["attention_mask"],
            ).logits
            next_token = logits[:, -1].argmax(-1)
            # One host sync per step instead of two.
            token_id = next_token.item()
            if token_id == EOS_TOKEN_ID:
                break
            generated_token_ids.append(token_id)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)

    result = tokenizer.decode(generated_token_ids)
    print(f"Time taken: {time.perf_counter() - start_time}")
    return result
if __name__ == "__main__":
    # Hub repository that provides both the tokenizer and the model weights.
    MODEL_ID = 'NexaAIDev/Dolphin'

    # Use the first CUDA device when one is available, otherwise the CPU.
    target_device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Make the local Dolphin config/model classes known to the Auto* factories
    # before anything is loaded.
    AutoConfig.register("dolphin", DolphinConfig)
    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)

    # `tokenizer` and `model` are module-level names on purpose:
    # inference_instruct() reads them as globals.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map=target_device,
    )

    # Example input: a short company description and a question about it.
    sample_context = (
        "Nexa AI is a Cupertino-based company founded in May 2023 that "
        "researches and develops models and tools for on-device AI "
        "applications. The company is founded by Alex and Zack. The company "
        "is known for its Octopus-series models, which rival large-scale "
        "language models in capabilities such as function-calling, "
        "multimodality, and action-planning, while remaining efficient and "
        "compact for edge device deployment. Nexa AI's mission is to advance "
        "on-device AI in collaboration with the global developer community. "
        "To this end, the company has created an on-device model hub for "
        "users to find, share, and collaborate on open-source AI models "
        "optimized for edge devices, as well as an SDK for developers to run "
        "and deploy AI models locally"
    )
    sample_question = "Who founded Nexa AI?"

    # Run on the same device the model was loaded onto.
    answer = inference_instruct(sample_context, sample_question, device=target_device)
    print("Result:", answer)