
# tensorgraph-math

Mathematics primitives used by tensorgraph. Builds upon tensorgraph-sys to support many BLAS backends and devices.

Basic example using OpenBLAS:

Enable the feature in your `Cargo.toml`:

```toml
tensorgraph-math = { version = "0.1.11", features = ["openblas"] }
```

```rust
use tensorgraph_math::{tensor::Tensor, sys::View};

//     0 1
// A = 2 3
//     4 5

// B = 0 1
//     2 3

// column major (read each column first)
let a = [0., 2., 4., 1., 3., 5.];
let b = [0., 2., 1., 3.];

let a = Tensor::from_shape([3, 2], a); // 3 rows x 2 cols
let b = Tensor::from_shape([2, 2], b); // 2 rows x 2 cols

//           2  3
// C = AB =  6 11
//          10 19

let c = a.matmul(b.view());
assert_eq!(c.into_inner().into_std(), [2., 6., 10., 3., 11., 19.]);
```
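
As a sanity check on the column-major layout, the same product can be reproduced with a dependency-free naive matmul in plain Rust. This is an illustrative std-only sketch, not part of the tensorgraph API:

```rust
/// Naive column-major matrix multiply: C (m x n) = A (m x k) * B (k x n).
/// In a column-major buffer with `rows` rows, element (i, j) lives at `j * rows + i`.
fn matmul_col_major(m: usize, k: usize, n: usize, a: &[f64], b: &[f64]) -> Vec<f64> {
    let mut c = vec![0.0; m * n];
    for j in 0..n {
        for p in 0..k {
            for i in 0..m {
                c[j * m + i] += a[p * m + i] * b[j * k + p];
            }
        }
    }
    c
}

fn main() {
    let a = [0., 2., 4., 1., 3., 5.]; // 3x2, column-major
    let b = [0., 2., 1., 3.]; // 2x2, column-major
    assert_eq!(matmul_col_major(3, 2, 2, &a, &b), [2., 6., 10., 3., 11., 19.]);
}
```

The innermost loop walks down one column of `C` and `A`, which is the contiguous (and therefore cache-friendly) direction for column-major storage.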

Intermediate example using cuBLAS and OpenBLAS together via global contexts:

Enable both features in your `Cargo.toml`:

```toml
tensorgraph-math = { version = "0.1.11", features = ["openblas", "cublas"] }
```

```rust
use tensorgraph_math::{
    blas::{DefaultBLASContext, cublas::CublasContext, BLAS},
    sys::{
        device::{DefaultDeviceAllocator, cuda::{Context, Cuda, Stream}, cpu::Cpu},
        DefaultVec, View,
    },
    tensor::Tensor,
};

fn main() {
    // init cuda context
    let cuda_ctx = Context::quick_init().unwrap();

    // create cuda stream and configure it as the global
    let stream = Stream::new(&cuda_ctx).unwrap();
    let _handle = stream.as_global();

    // create cublas context, with the provided stream, and configure it as the global
    let cublas_ctx = CublasContext::new();
    let _handle = cublas_ctx.with_stream(Some(&stream)).as_global();

    // cublas is the default BLAS implementation for CUDA when the feature is enabled
    run::<Cuda>();

    // openblas is the default BLAS implementation for CPU when the feature is enabled
    run::<Cpu>();
}

/// Generic code that runs on the specified device,
/// using that device's default allocator and BLAS provider.
fn run<D: DefaultDeviceAllocator + DefaultBLASContext>()
where
    f32: BLAS<D::Context>,
{
    //     0 1
    // A = 2 3
    //     4 5

    // B = 0 1
    //     2 3

    // column major (read each column first)
    let a = DefaultVec::<f32, D>::copy_from_host(&[0., 2., 4., 1., 3., 5.]);
    let b = DefaultVec::<f32, D>::copy_from_host(&[0., 2., 1., 3.]);

    let a = Tensor::from_shape([3, 2], a); // 3 rows x 2 cols
    let b = Tensor::from_shape([2, 2], b); // 2 rows x 2 cols

    //           2  3
    // C = AB =  6 11
    //          10 19

    let c = a.matmul(b.view());

    let mut out = [0.; 6];
    c.into_inner().copy_to_host(&mut out);
    assert_eq!(out, [2., 6., 10., 3., 11., 19.]);
}
```
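
A note on the `_handle` bindings above: `as_global` appears to return a guard value, so the stream and cuBLAS context stay installed as the global defaults only while that guard is alive (binding to `_handle` keeps it alive for the scope, whereas `let _ = ...` would drop it immediately). A minimal sketch of that guard pattern in plain Rust, purely illustrative and not the crate's actual implementation:

```rust
use std::cell::Cell;

thread_local! {
    // Thread-local slot standing in for a "current global context".
    static GLOBAL_CTX: Cell<Option<u32>> = Cell::new(None);
}

/// Guard that installs a context as the global default and
/// restores the previous one when dropped.
struct GlobalHandle {
    previous: Option<u32>,
}

fn install_global(id: u32) -> GlobalHandle {
    GlobalHandle {
        previous: GLOBAL_CTX.with(|g| g.replace(Some(id))),
    }
}

impl Drop for GlobalHandle {
    fn drop(&mut self) {
        GLOBAL_CTX.with(|g| g.set(self.previous));
    }
}

fn main() {
    {
        let _handle = install_global(42);
        GLOBAL_CTX.with(|g| assert_eq!(g.get(), Some(42)));
    } // _handle dropped here: the previous global is restored
    GLOBAL_CTX.with(|g| assert_eq!(g.get(), None));
}
```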

Advanced example using OpenBLAS and cuBLAS by passing BLAS contexts and allocators explicitly:

Enable both features in your `Cargo.toml`:

```toml
tensorgraph-math = { version = "0.1.11", features = ["openblas", "cublas"] }
```

```rust
#![feature(allocator_api)]
use std::{alloc::Global, ops::Deref};
use tensorgraph_math::{
    blas::{BLASContext, cublas::CublasContext, BLAS},
    sys::{
        device::{cuda::{Context, Cuda, Stream}, cpu::Cpu, Device, DeviceAllocator},
        Vec, View,
    },
    tensor::Tensor,
};

fn main() {
    // init cuda context
    let cuda_ctx = Context::quick_init().unwrap();

    // create cuda stream
    let stream = Stream::new(&cuda_ctx).unwrap();

    // create cublas context, with the provided stream
    let cublas_ctx = CublasContext::new();
    let cublas_ctx = cublas_ctx.with_stream(Some(&stream));

    // run using the CUDA stream as the allocator, and cublas
    // as the BLAS provider
    run(cublas_ctx, stream.deref());

    // run using the CPU default BLAS and Global allocator
    run((), Global);
}

fn run<C: BLASContext, A: DeviceAllocator<Device = C::Device> + Copy>(ctx: C, alloc: A)
where
    f32: BLAS<C>,
{
    //     0 1
    // A = 2 3
    //     4 5

    // B = 0 1
    //     2 3

    // column major (read each column first)
    let a = Vec::copy_from_host_in(&[0., 2., 4., 1., 3., 5.], alloc);
    let b = Vec::copy_from_host_in(&[0., 2., 1., 3.0_f32], alloc);

    let a = Tensor::from_shape([3, 2], a); // 3 rows x 2 cols
    let b = Tensor::from_shape([2, 2], b); // 2 rows x 2 cols

    //           2  3
    // C = AB =  6 11
    //          10 19

    let c = a.matmul_into(b.view(), ctx, alloc);

    let mut out = [0.; 6];
    c.into_inner().copy_to_host(&mut out);
    assert_eq!(out, [2., 6., 10., 3., 11., 19.]);
}
```
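
The `#![feature(allocator_api)]` line is needed because this example is generic over the allocator: the same `run` accepts both a CUDA stream and the host `Global` allocator. Here is a minimal std-only illustration of that nightly mechanism; `collect_squares` is a hypothetical function for demonstration, not part of the crate:

```rust
#![feature(allocator_api)]

use std::alloc::{Allocator, Global};

// Generic over any allocator, mirroring how `run` above is generic
// over `DeviceAllocator`.
fn collect_squares<A: Allocator>(alloc: A) -> Vec<u64, A> {
    let mut v = Vec::new_in(alloc);
    for i in 0..4 {
        v.push(i * i);
    }
    v
}

fn main() {
    // `Global` is the standard host allocator; any other type
    // implementing `Allocator` could be passed instead.
    assert_eq!(collect_squares(Global), [0, 1, 4, 9]);
}
```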
