Skip to main content

Overview

The Device class abstracts computational hardware, allowing tensors and models to run on different devices without code changes. Neurenix automatically detects available hardware and provides a unified API across all backends.
from neurenix.device import Device, DeviceType, get_available_devices

# Create device instances
cpu = Device(DeviceType.CPU)
gpu = Device(DeviceType.CUDA, 0)  # First CUDA GPU
tpu = Device(DeviceType.TPU, 0)   # First TPU

# List all available devices
devices = get_available_devices()
for device in devices:
    print(device)

Device Types

Neurenix supports a comprehensive range of hardware backends through the DeviceType enum:
from neurenix.device import Device, DeviceType

# CPU - Always available
cpu = Device(DeviceType.CPU)

# NVIDIA CUDA - For NVIDIA GPUs
cuda = Device(DeviceType.CUDA, 0)

# AMD ROCm - For AMD GPUs
rocm = Device(DeviceType.ROCM, 0)

# Google TPU - For Cloud TPU
tpu = Device(DeviceType.TPU, 0)

Creating Devices

From DeviceType

from neurenix.device import Device, DeviceType

# Create with default index (0)
cpu = Device(DeviceType.CPU)
cuda_0 = Device(DeviceType.CUDA)  # Equivalent to CUDA:0

# Create with specific index
cuda_1 = Device(DeviceType.CUDA, 1)  # Second GPU
cuda_2 = Device(DeviceType.CUDA, 2)  # Third GPU

From String

from neurenix.device import get_device

# Parse device strings
cpu = get_device("cpu")
cuda_0 = get_device("cuda:0")
cuda_1 = get_device("cuda:1")
tpu = get_device("tpu:0")
rocm = get_device("rocm:0")
Device strings follow the format type:index, where the index defaults to 0 if omitted.

Device Properties

from neurenix.device import Device, DeviceType

device = Device(DeviceType.CUDA, 1)

# Get device type
print(device.type)        # DeviceType.CUDA

# Get device index
print(device.index)       # 1

# Get device name
print(device.name)        # "CUDA:1"

# String representation
print(repr(device))       # "Device(CUDA:1)"
print(str(device))        # "Device(CUDA:1)"

Hardware Detection

Checking Device Availability

from neurenix.device import get_device_count, DeviceType

# Count devices by type
cuda_count = get_device_count(DeviceType.CUDA)
rocm_count = get_device_count(DeviceType.ROCM)
tpu_count = get_device_count(DeviceType.TPU)

print(f"CUDA devices: {cuda_count}")
print(f"ROCm devices: {rocm_count}")
print(f"TPU devices: {tpu_count}")

Listing Available Devices

from neurenix.device import get_available_devices

devices = get_available_devices()

print(f"Found {len(devices)} devices:")
for device in devices:
    print(f"  - {device.name}")

Total Device Count

from neurenix.device import Device

total = Device.device_count()
print(f"Total devices available: {total}")

Using Devices with Tensors

Creating Tensors on Specific Devices

from neurenix.tensor import Tensor
from neurenix.device import Device, DeviceType

# Create directly on GPU
gpu_tensor = Tensor.randn((100, 100), device=Device(DeviceType.CUDA, 0))

# Create on CPU then move to GPU
cpu_tensor = Tensor.randn((100, 100))
gpu_tensor = cpu_tensor.to(Device(DeviceType.CUDA, 0))

# Create on TPU
tpu_tensor = Tensor.zeros((50, 50), device=Device(DeviceType.TPU, 0))

Moving Between Devices

from neurenix.tensor import Tensor
from neurenix.device import Device, DeviceType

# Create on CPU
tensor = Tensor.randn((1000, 1000))

# Move to GPU (creates new tensor)
gpu_tensor = tensor.to(Device(DeviceType.CUDA, 0))

# Move to different GPU
gpu1_tensor = gpu_tensor.to(Device(DeviceType.CUDA, 1))

# Move back to CPU
cpu_tensor = gpu1_tensor.to(Device(DeviceType.CPU))

Device-Specific Features

CUDA Devices

from neurenix.device import Device, DeviceType, get_device_count
from neurenix.device_manager import DeviceManager

# Check CUDA availability
if get_device_count(DeviceType.CUDA) > 0:
    device = Device(DeviceType.CUDA, 0)
    
    # Get memory statistics
    manager = DeviceManager()
    stats = manager.get_memory_stats(device)
    
    print(f"Total GPU memory: {stats['total'] / 1e9:.2f} GB")
    print(f"Available: {stats['available'] / 1e9:.2f} GB")
    print(f"Used: {stats['used'] / 1e9:.2f} GB")

WebGPU for Browser Deployment

from neurenix.device import Device, DeviceType, get_device_count

# Check WebGPU availability (in WebAssembly context)
if get_device_count(DeviceType.WEBGPU) > 0:
    webgpu = Device(DeviceType.WEBGPU, 0)
    
    # Run inference in browser
    from neurenix.tensor import Tensor
    
    model = load_model()
    model.to(webgpu)
    
    input_tensor = Tensor(input_data, device=webgpu)
    output = model(input_tensor)
WebGPU support is automatically detected when running in a WebAssembly environment with GPU access.

TPU Devices

from neurenix.device import Device, DeviceType, get_device_count
from neurenix.tensor import Tensor

# TPUs are optimized for large batch inference
if get_device_count(DeviceType.TPU) > 0:
    tpu = Device(DeviceType.TPU, 0)
    
    # Large batch processing
    batch = Tensor.randn((1024, 512), device=tpu)
    model = MyModel().to(tpu)
    
    predictions = model(batch)

Device Comparison

from neurenix.device import Device, DeviceType

cuda_0 = Device(DeviceType.CUDA, 0)
cuda_1 = Device(DeviceType.CUDA, 1)
cpu = Device(DeviceType.CPU)

# Device equality
print(cuda_0 == cuda_0)  # True
print(cuda_0 == cuda_1)  # False (different index)
print(cuda_0 == cpu)     # False (different type)

# Devices are hashable
device_set = {cuda_0, cuda_1, cpu}
device_dict = {cuda_0: "GPU 0", cuda_1: "GPU 1"}

Multi-Device Training

Distribute workload across multiple devices:
from neurenix.device import Device, DeviceType, get_device_count
from neurenix.tensor import Tensor

# Get all available CUDA devices
num_gpus = get_device_count(DeviceType.CUDA)
gpus = [Device(DeviceType.CUDA, i) for i in range(num_gpus)]

if len(gpus) > 1:
    # Split batch across GPUs
    batch_size_per_gpu = total_batch_size // len(gpus)
    
    models = [model.clone().to(gpu) for gpu in gpus]
    
    for i, (gpu, model) in enumerate(zip(gpus, models)):
        start_idx = i * batch_size_per_gpu
        end_idx = start_idx + batch_size_per_gpu
        
        batch = Tensor(data[start_idx:end_idx], device=gpu)
        output = model(batch)

Best Practices

Check Availability

Always check device availability with get_device_count() before using a device.

Consistent Device Placement

Keep tensors and models on the same device to avoid transfer overhead

Use Genesis

Let the Genesis system handle device selection for optimal performance

Profile Memory

Monitor GPU memory usage with DeviceManager.get_memory_stats()

Common Patterns

Automatic Fallback

from neurenix.device import Device, DeviceType, get_device_count

# Try GPU, fallback to CPU
if get_device_count(DeviceType.CUDA) > 0:
    device = Device(DeviceType.CUDA, 0)
elif get_device_count(DeviceType.ROCM) > 0:
    device = Device(DeviceType.ROCM, 0)
else:
    device = Device(DeviceType.CPU)

print(f"Using device: {device}")

Device-Agnostic Code

from neurenix.device_manager import Genesis
from neurenix.tensor import Tensor

# Let Genesis select the best device
genesis = Genesis()
device = genesis.select_device(workload_type="training")

# All subsequent operations use the selected device
model = MyModel().to(device)
data = Tensor(training_data, device=device)
targets = Tensor(labels, device=device)

output = model(data)
loss = loss_fn(output, targets)

API Reference

| Class/Function | Description |
| --- | --- |
| `Device(type, index)` | Create a device instance |
| `DeviceType` | Enum of supported device types |
| `get_device(str)` | Parse device from string |
| `get_device_count(type)` | Count devices of a specific type |
| `get_available_devices()` | List all available devices |
| `Device.device_count()` | Get total number of devices |

Device Properties

| Property | Type | Description |
| --- | --- | --- |
| `.type` | `DeviceType` | The device type |
| `.index` | `int` | The device index |
| `.name` | `str` | Human-readable device name |