"""
TensorRT Inference Example for WayraPPL

Requires an A100 GPU with TensorRT 10.13+.
"""

import time

import numpy as np
import pycuda.autoinit  # noqa: F401 -- side effect: creates the CUDA context
import pycuda.driver as cuda
import tensorrt as trt
from transformers import AutoTokenizer


class WayraPPLTensorRT:
    def __init__(self, engine_path: str):
        # Deserialize a prebuilt engine and create one execution context/stream.
        trt_logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(trt_logger)

        with open(engine_path, 'rb') as f:
            engine_data = f.read()

        self.engine = runtime.deserialize_cuda_engine(engine_data)
        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()

    def infer(self, input_ids: np.ndarray, attention_mask: np.ndarray):
        batch_size, seq_len = input_ids.shape

        # Bind this batch's dynamic input shapes.
        self.context.set_input_shape("input_ids", input_ids.shape)
        self.context.set_input_shape("attention_mask", attention_mask.shape)

        # Stage contiguous int64 host copies and keep references to them until
        # the stream synchronizes: the H2D copies below are asynchronous, so
        # passing a temporary array would race with its deallocation. Sizing
        # the device buffers from the staged copies also stays correct if the
        # caller passes a narrower integer dtype.
        h_input_ids = np.ascontiguousarray(input_ids, dtype=np.int64)
        h_attention_mask = np.ascontiguousarray(attention_mask, dtype=np.int64)

        d_input_ids = cuda.mem_alloc(h_input_ids.nbytes)
        d_attention_mask = cuda.mem_alloc(h_attention_mask.nbytes)

        cuda.memcpy_htod_async(d_input_ids, h_input_ids, self.stream)
        cuda.memcpy_htod_async(d_attention_mask, h_attention_mask, self.stream)

        # Allocate host/device buffers for every output tensor and bind them.
        # Assumes FP32 outputs; for other types, query engine.get_tensor_dtype().
        outputs = {}
        device_outputs = {}

        for i in range(self.engine.num_io_tensors):
            tensor_name = self.engine.get_tensor_name(i)
            if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.OUTPUT:
                output_shape = tuple(self.context.get_tensor_shape(tensor_name))
                if output_shape[0] == -1:
                    # Still dynamic: assume dim 0 is the batch dimension.
                    output_shape = (batch_size,) + output_shape[1:]

                host_output = np.empty(output_shape, dtype=np.float32)
                device_output = cuda.mem_alloc(host_output.nbytes)

                outputs[tensor_name] = host_output
                device_outputs[tensor_name] = device_output
                self.context.set_tensor_address(tensor_name, int(device_output))

        # Bind the input device buffers, then enqueue inference on the stream.
        self.context.set_tensor_address("input_ids", int(d_input_ids))
        self.context.set_tensor_address("attention_mask", int(d_attention_mask))

        self.context.execute_async_v3(stream_handle=self.stream.handle)

        # Copy every output back to the host and wait for all queued work.
        for tensor_name, host_output in outputs.items():
            cuda.memcpy_dtoh_async(host_output, device_outputs[tensor_name], self.stream)

        self.stream.synchronize()

        # Release the per-call device buffers.
        d_input_ids.free()
        d_attention_mask.free()
        for device_output in device_outputs.values():
            device_output.free()

        return outputs
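
    # NOTE: infer() allocates and frees its device buffers on every call, which
    # adds allocation overhead at high batch rates. A common refinement (a
    # sketch, not part of the original example) is to allocate buffers once in
    # __init__ for the maximum shapes and reuse them across calls, e.g.:
    #
    #   max_input_bytes = 2048 * 512 * np.dtype(np.int64).itemsize
    #   self.d_input_ids = cuda.mem_alloc(max_input_bytes)
    #
    # freeing them only when the wrapper is torn down.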


if __name__ == "__main__":
    model = WayraPPLTensorRT("wayrappl_fp16_bs2048.engine")
    tokenizer = AutoTokenizer.from_pretrained(".")  # tokenizer files in the working directory

    # Sample sentences in Spanish, Portuguese, and English.
    texts = [
        # Spanish
        "La inteligencia artificial está transformando el mundo de manera profunda y acelerada.",
        "El análisis de datos permite descubrir patrones ocultos en grandes volúmenes de información.",
        # Portuguese
        "A tecnologia blockchain promete revolucionar sistemas financeiros tradicionais.",
        "Machine learning possibilita a automação de processos complexos em diversas indústrias.",
        # English
        "Natural language processing enables computers to understand human communication.",
        "Deep learning algorithms require massive computational resources for training.",
    ]

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.infer(inputs['input_ids'].numpy(), inputs['attention_mask'].numpy())

    print("Perplexity scores:")
    for i, text in enumerate(texts):
        print(f"{text[:50]}... -> PPL: {outputs['ppl'][i]:.2f}")
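
    # The 'ppl' key assumes the engine exposes an output tensor with that name.
    # To inspect an engine's actual I/O names (sketch, using the same
    # TensorRT 10 API as above):
    #
    #   for i in range(model.engine.num_io_tensors):
    #       name = model.engine.get_tensor_name(i)
    #       print(name, model.engine.get_tensor_mode(name))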

    print("\n" + "=" * 50)
    print("PERFORMANCE COMPARISON: 100K Examples")
    print("=" * 50)

    # 6 texts x 16667 repeats = 100,002 examples (~100K).
    large_texts = texts * 16667

    start_time = time.time()
    batch_size = 2048
    total_processed = 0

    for i in range(0, len(large_texts), batch_size):
        batch = large_texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model.infer(inputs['input_ids'].numpy(), inputs['attention_mask'].numpy())
        total_processed += len(batch)

    tensorrt_time = time.time() - start_time  # seconds
    tensorrt_throughput = total_processed / tensorrt_time  # samples/sec
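
    # Note: the loop above times CPU-side tokenization together with inference;
    # a pure engine-throughput figure would pre-tokenize the corpus and run one
    # untimed warm-up batch before starting the clock.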

    print("TensorRT Results:")
    print(f"  Time: {tensorrt_time / 3600:.2f} hours")
    print(f"  Throughput: {tensorrt_throughput:.0f} samples/sec")
    print(f"  Total processed: {total_processed:,} examples")

    # Baseline for comparison: assumes a PyTorch pipeline sustains ~1,000 samples/sec.
    pytorch_throughput = 1000
    pytorch_time = total_processed / pytorch_throughput / 3600  # hours

    print("\nPyTorch Estimated:")
    print(f"  Time: {pytorch_time:.2f} hours")
    print(f"  Throughput: {pytorch_throughput} samples/sec")

    # Compare like units: pytorch_time is in hours, tensorrt_time in seconds.
    speedup = pytorch_time / (tensorrt_time / 3600)
    print(f"\nSpeedup: {speedup:.1f}x faster with TensorRT")