tensorrt 0.0

The project is in a healthy, maintained state.
Minimal Ruby bindings for NVIDIA TensorRT inference using Rice.

Dependencies

Development

  • ~> 13.0

Runtime

  • ~> 1.0
  • >= 4.7

Project Readme

tensorrt-rb

Minimal TensorRT bindings for Ruby using Rice (C++ bindings).

Requirements

  • Linux (x86_64 or aarch64)
  • Ruby >= 3.0
  • TensorRT (with headers and libraries)
  • CUDA runtime
  • Rice gem (gem install rice)

Installation

# From a checkout of the repository
cd tensorrt-rb

# Set paths if not in standard locations
export TENSORRT_INCLUDE=/path/to/tensorrt/include
export TENSORRT_LIB=/path/to/tensorrt/lib
export CUDA_INCLUDE=/usr/local/cuda/include
export CUDA_LIB=/usr/local/cuda/lib64

# Build
rake compile

# Or install as gem
gem build tensorrt.gemspec
gem install tensorrt-*.gem

Default Library Paths

x86_64:

  • TensorRT: /usr/include/x86_64-linux-gnu, /usr/lib/x86_64-linux-gnu
  • CUDA: /usr/local/cuda/include, /usr/local/cuda/lib64

aarch64:

  • TensorRT: /usr/include/aarch64-linux-gnu, /usr/lib/aarch64-linux-gnu
  • CUDA: /usr/local/cuda/include, /usr/local/cuda/lib64

API

TensorRT::Engine

engine = TensorRT::Engine.new(path, verbose: false)

# Tensor info
engine.num_io_tensors                  # Number of input/output tensors
engine.get_tensor_name(index)          # Tensor name by index
engine.is_input?(name)                 # Check if tensor is input
engine.get_tensor_shape(name)          # Shape as array [1, 3, 640, 640]
engine.get_tensor_bytes(name)          # Size in bytes

# Memory binding
engine.set_tensor_address(name, device_ptr)

# Inference
engine.execute                         # Synchronous (blocking)
engine.enqueue                         # Asynchronous (non-blocking)

# Stream management
engine.get_stream                      # CUDA stream handle (uint64)
engine.stream_synchronize              # Wait for stream completion
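
The introspection calls above are enough to walk an engine's bindings. A minimal sketch (assuming a serialized engine file named model.engine, as in the examples below):

require "tensorrt"

engine = TensorRT::Engine.new("model.engine")

# Print the role, shape, and byte size of every I/O tensor
engine.num_io_tensors.times do |i|
  name  = engine.get_tensor_name(i)
  role  = engine.is_input?(name) ? "input" : "output"
  shape = engine.get_tensor_shape(name)
  bytes = engine.get_tensor_bytes(name)
  puts "#{name} (#{role}): shape=#{shape.inspect}, #{bytes} bytes"
end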

TensorRT::CUDA

# Memory allocation
ptr = TensorRT::CUDA.malloc(bytes)
TensorRT::CUDA.free(ptr)

# Synchronous copy
TensorRT::CUDA.memcpy_htod(device_ptr, host_ptr, bytes)  # Host → Device
TensorRT::CUDA.memcpy_dtoh(host_ptr, device_ptr, bytes)  # Device → Host

# Asynchronous copy
TensorRT::CUDA.memcpy_htod_async(device_ptr, host_ptr, bytes, stream)
TensorRT::CUDA.memcpy_dtoh_async(host_ptr, device_ptr, bytes, stream)

# Synchronization
TensorRT::CUDA.synchronize                    # All operations
TensorRT::CUDA.stream_synchronize(stream)     # Specific stream
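
As a quick illustration, here is a host-to-device-to-host round trip using these calls, with an FFI::MemoryPointer as the host buffer (the same host-pointer convention the examples below use); this is a sketch, not tied to any model:

require "tensorrt"
require "ffi"

count = 16
bytes = count * 4  # 16 float32 values

# Host buffers: one filled with test data, one to receive the copy back
host_in  = FFI::MemoryPointer.new(:float, count)
host_in.write_array_of_float((0...count).map(&:to_f))
host_out = FFI::MemoryPointer.new(:float, count)

# Allocate device memory, copy host -> device -> host, then free
dev = TensorRT::CUDA.malloc(bytes)
TensorRT::CUDA.memcpy_htod(dev, host_in, bytes)
TensorRT::CUDA.memcpy_dtoh(host_out, dev, bytes)
TensorRT::CUDA.free(dev)

p host_out.read_array_of_float(count)  # => [0.0, 1.0, ..., 15.0]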

Examples

Synchronous Inference

require "tensorrt"

engine = TensorRT::Engine.new("model.engine")

# Allocate GPU memory
input_bytes = engine.get_tensor_bytes("input")
output_bytes = engine.get_tensor_bytes("output")
output_size = engine.get_tensor_shape("output").reduce(1, :*)

input_device = TensorRT::CUDA.malloc(input_bytes)
output_device = TensorRT::CUDA.malloc(output_bytes)
engine.set_tensor_address("input", input_device)
engine.set_tensor_address("output", output_device)

# Prepare input data (preprocess_image is a user-supplied helper; image_path is the path to your image)
input_data = preprocess_image(image_path)  # Returns Numo::SFloat
input_host = FFI::MemoryPointer.new(:float, input_data.size)
input_host.write_bytes(input_data.to_binary)

# Copy input to GPU
TensorRT::CUDA.memcpy_htod(input_device, input_host, input_bytes)

# Run inference
engine.execute

# Copy output from GPU
output_host = FFI::MemoryPointer.new(:float, output_size)
TensorRT::CUDA.memcpy_dtoh(output_host, output_device, output_bytes)
output_data = output_host.read_array_of_float(output_size)

# Cleanup
TensorRT::CUDA.free(input_device)
TensorRT::CUDA.free(output_device)

Pipelined Async Inference

Overlap CPU preprocessing of the next image with GPU inference on the current one to improve throughput:

require "tensorrt"

engine = TensorRT::Engine.new("model.engine")
stream = engine.get_stream

# Allocate GPU memory
input_bytes = engine.get_tensor_bytes("input")
output_bytes = engine.get_tensor_bytes("output")
output_size = engine.get_tensor_shape("output").reduce(1, :*)

input_device = TensorRT::CUDA.malloc(input_bytes)
output_device = TensorRT::CUDA.malloc(output_bytes)
engine.set_tensor_address("input", input_device)
engine.set_tensor_address("output", output_device)

# Allocate host buffers
input_host = FFI::MemoryPointer.new(:float, input_bytes / 4)
output_host = FFI::MemoryPointer.new(:float, output_size)

# Preload the first image (images is an array of image paths; preprocess_image is user-supplied)
current_image = preprocess_image(images[0])

images.each_index do |i|
  # Copy current image to GPU (async)
  input_host.write_bytes(current_image.to_binary)
  TensorRT::CUDA.memcpy_htod_async(input_device, input_host, input_bytes, stream)

  # Start async inference
  engine.enqueue

  # Preprocess next image on CPU while GPU is busy
  next_image = preprocess_image(images[i + 1]) if i < images.size - 1

  # Wait for GPU inference to complete
  engine.stream_synchronize

  # Copy output from GPU
  TensorRT::CUDA.memcpy_dtoh(output_host, output_device, output_bytes)
  output_data = output_host.read_array_of_float(output_size)

  # Process results
  process_detections(output_data)

  current_image = next_image
end

# Cleanup
TensorRT::CUDA.free(input_device)
TensorRT::CUDA.free(output_device)