pesi

Luigi committed
Commit 889281f · 1 parent: 2f2f685

Implement pycuda backend for inference with TensorRT engine


Adds a pycuda backend in addition to the original polygraphy backend.
The default is polygraphy; set TRT_BACKEND to 'PYCUDA' to choose the pycuda backend.
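
For example, a minimal usage sketch (the engine path is hypothetical; TRT_BACKEND is the module-level variable introduced by this commit, so it can be overridden after import):

    import rtmo_gpu

    # Select the pycuda backend before running inference (default is
    # 'POLYGRAPHY'); the inference code reads this module-level
    # variable at call time.
    rtmo_gpu.TRT_BACKEND = 'PYCUDA'

    # 'rtmo-s.engine' is a hypothetical TensorRT engine file path.
    pose = rtmo_gpu.RTMO_GPU(model='rtmo-s.engine')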

Files changed (1): rtmo_gpu.py (+125 -19)
rtmo_gpu.py CHANGED
@@ -5,6 +5,7 @@ import onnxruntime as ort
 import cv2
 from queue import Queue
 os.environ['ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS']='libmmdeploy_tensorrt_ops.so'
+TRT_BACKEND='POLYGRAPHY'
 
 # dictionary from https://github.com/Tau-J/rtmlib/blob/4b29101d54b611048ef165277cebfffff3030074/rtmlib/visualization/skeleton/coco17.py
 coco17 = dict(name='coco17',
@@ -442,17 +443,39 @@ class RTMO_GPU(object):
             outputs = [output.numpy() for output in io_binding.get_outputs()]
 
         else: # 'engine'
-
-            if not self.session.is_active:
-                self.session.activate()
-
-            outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
-            outputs = [output for output in outputs.values()]
+            if TRT_BACKEND == 'POLYGRAPHY':
+                if not self.session.is_active:
+                    self.session.activate()
+
+                outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
+                outputs = [output for output in outputs.values()]
+            else: # PYCUDA
+                import pycuda.driver as cuda
+                # Set the input shape dynamically
+                input_shape = input.shape
+                self.context.set_binding_shape(0, input_shape)
+
+                # Ensure input_data matches the expected shape
+                np.copyto(self.inputs[0]['host'], input.ravel())
+                cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
+
+                # Run inference
+                self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
+
+                # Transfer predictions back from the GPU
+                for output in self.outputs:
+                    cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
+
+                # Synchronize the stream
+                self.stream.synchronize()
+
+                # Return only the output values (in their original shapes)
+                outputs = [out['host'].reshape(out['shape']) for out in self.outputs]
 
         return outputs
 
     def __exit__(self):
-        if self.model_format == 'engine':
+        if self.model_format == 'engine' and TRT_BACKEND == 'POLYGRAPHY':
             if self.session.is_active:
                 self.session.deactivate()
 
@@ -471,7 +494,11 @@ class RTMO_GPU(object):
                  mean: tuple = None,
                  std: tuple = None,
                  device: str = 'cuda',
-                 is_yolo_nas_pose = False):
+                 is_yolo_nas_pose = False,
+                 batch_size = 1,
+                 plugin_path = 'libmmdeploy_tensorrt_ops.so'):
+
+        self.batch_size = batch_size
 
         if not os.path.exists(model):
             # If the file does not exist, raise FileNotFoundError
@@ -499,10 +526,62 @@ class RTMO_GPU(object):
                 providers=providers[device])
 
         else: # 'engine'
-            from polygraphy.backend.common import BytesFromPath
-            from polygraphy.backend.trt import EngineFromBytes, TrtRunner
-            engine = EngineFromBytes(BytesFromPath(model))
-            self.session = TrtRunner(engine)
+            if TRT_BACKEND == 'POLYGRAPHY':
+                from polygraphy.backend.common import BytesFromPath
+                from polygraphy.backend.trt import EngineFromBytes, TrtRunner
+                engine = EngineFromBytes(BytesFromPath(model))
+                self.session = TrtRunner(engine)
+            else: # PYCUDA
+                import tensorrt as trt
+                import ctypes
+                import pycuda.autoinit
+                import pycuda.driver as cuda
+                self.TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+                self.trt_model_path = model
+                self.plugin_path = plugin_path
+
+                # Load the custom plugin library
+                ctypes.CDLL(self.plugin_path)
+
+                # Load the TensorRT engine
+                with open(self.trt_model_path, 'rb') as f:
+                    engine_data = f.read()
+
+                self.runtime = trt.Runtime(self.TRT_LOGGER)
+                self.engine = self.runtime.deserialize_cuda_engine(engine_data)
+
+                if self.engine is None:
+                    raise RuntimeError("Failed to load the engine.")
+
+                self.context = self.engine.create_execution_context()
+
+                self.inputs = []
+                self.outputs = []
+                self.bindings = []
+                self.stream = cuda.Stream()
+
+                # Allocate memory for inputs and outputs
+                for binding in self.engine:
+                    binding_index = self.engine.get_binding_index(binding)
+                    shape = self.engine.get_binding_shape(binding_index)
+                    if shape[0] == -1:
+                        # Handle dynamic batch size by setting max_batch_size
+                        shape[0] = self.batch_size
+                    size = trt.volume(shape)
+                    dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+
+                    # Allocate host and device buffers
+                    host_mem = cuda.pagelocked_empty(size, dtype)
+                    device_mem = cuda.mem_alloc(host_mem.nbytes)
+
+                    # Append the device buffer to device bindings.
+                    self.bindings.append(int(device_mem))
+
+                    # Append to the appropriate list.
+                    if self.engine.binding_is_input(binding):
+                        self.inputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
+                    else:
+                        self.outputs.append({'host': host_mem, 'device': device_mem, 'shape': shape})
 
         self.model_input_size = self.input_shape[2:4] # B, C, H, W,
         self.mean = mean
@@ -510,6 +589,9 @@ class RTMO_GPU(object):
         self.device = device
         self.is_yolo_nas_pose = is_yolo_nas_pose
 
+        print(f'[I] Detected \'{self.model_format.upper()}\' model', end='')
+        print(f', \'{TRT_BACKEND.upper()}\' backend is chosen for inference' if self.model_format == 'engine' else '')
+
 class RTMO_GPU_Batch(RTMO_GPU):
     def preprocess_batch(self, imgs: List[np.ndarray]) -> Tuple[np.ndarray, List[float]]:
         """Process a batch of images for RTMPose model inference.
@@ -571,12 +653,34 @@ class RTMO_GPU_Batch(RTMO_GPU):
             outputs = [output.numpy() for output in io_binding.get_outputs()]
 
         else: # 'engine'
+            if TRT_BACKEND == 'POLYGRAPHY':
+                if not self.session.is_active:
+                    self.session.activate()
 
-            if not self.session.is_active:
-                self.session.activate()
-
-            outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
-            outputs = [output for output in outputs.values()]
+                outputs = self.session.infer(feed_dict={'input': input}, check_inputs=False)
+                outputs = [output for output in outputs.values()]
+            else: # PYCUDA
+                import pycuda.driver as cuda
+                # Set the input shape dynamically
+                input_shape = input.shape
+                self.context.set_binding_shape(0, input_shape)
+
+                # Ensure input_data matches the expected shape
+                np.copyto(self.inputs[0]['host'], input.ravel())
+                cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
+
+                # Run inference
+                self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
+
+                # Transfer predictions back from the GPU
+                for output in self.outputs:
+                    cuda.memcpy_dtoh_async(output['host'], output['device'], self.stream)
+
+                # Synchronize the stream
+                self.stream.synchronize()
+
+                # Return only the output values (in their original shapes)
+                outputs = [out['host'].reshape(out['shape']) for out in self.outputs]
 
         return outputs
 
@@ -651,14 +755,16 @@ class RTMO_GPU_Batch(RTMO_GPU):
                  std: tuple = None,
                  device: str = 'cuda',
                  is_yolo_nas_pose = False,
+                 plugin_path = 'libmmdeploy_tensorrt_ops.so',
                  batch_size: int = 1):
         super().__init__(model,
                          mean,
                          std,
                          device,
-                         is_yolo_nas_pose)
+                         is_yolo_nas_pose,
+                         batch_size,
+                         plugin_path)
 
-        self.batch_size = batch_size
         self.in_queues = dict()
        self.out_queues = dict()
        self.buffers = dict()
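
For reference, a minimal sketch of constructing the batch wrapper with the parameters this commit adds (the engine path and batch size are illustrative, not from the commit):

    from rtmo_gpu import RTMO_GPU_Batch

    pose = RTMO_GPU_Batch(model='rtmo-s.engine',                      # hypothetical engine file
                          plugin_path='libmmdeploy_tensorrt_ops.so',  # custom-op plugin, loaded via ctypes on the PYCUDA path
                          batch_size=2)                               # also used to resolve dynamic (-1) batch dimensions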