12 releases
0.1.11 | Jan 5, 2022 |
---|---|
0.1.10 | Jan 4, 2022 |
#867 in Machine learning
22 downloads per month
115KB
3K
SLoC
tensorgraph-math
Mathematics primitives used by tensorgraph. Builds upon tensorgraph-sys to support many BLAS backends and devices.
Basic example using openblas:
Enable features in the Cargo.toml:
tensorgraph-math = { version = "LATEST_VERSION", features = ["openblas"] }
use tensorgraph_math::{tensor::Tensor, sys::View};
// Inputs, written out as matrices:
//
//     0 1
// A = 2 3
//     4 5
//
// B = 0 1
//     2 3
//
// The backing arrays are column major (read each column first).
let a = [0., 2., 4., 1., 3., 5.];
let b = [0., 2., 1., 3.];
let a = Tensor::from_shape([3, 2], a); // 3 rows x 2 cols
let b = Tensor::from_shape([2, 2], b); // 2 rows x 2 cols
// Expected product:
//
//          2  3
// C = AB = 6  11
//          10 19
let c = a.matmul(b.view());
// The expected values below are the columns of C, listed one after the other.
assert_eq!(c.into_inner().into_std(), [2., 6., 10., 3., 11., 19.]);
Intermediate example using cublas globals and openblas together:
Enable features in the Cargo.toml:
tensorgraph-math = { version = "LATEST_VERSION", features = ["openblas", "cublas"] }
use tensorgraph_math::{
blas::{DefaultBLASContext, cublas::CublasContext, BLAS},
sys::{
device::{DefaultDeviceAllocator, cuda::{Context, Cuda, Stream}, cpu::Cpu},
DefaultVec, View,
},
tensor::Tensor,
};
/// Entry point of the example: configures a CUDA stream and a cuBLAS context
/// as globals, then runs the same generic code for `Cuda` and for `Cpu`.
fn main() {
    // init cuda context
    let cuda_ctx = Context::quick_init().unwrap();
    // create cuda stream and configure it as the global
    let stream = Stream::new(&cuda_ctx).unwrap();
    // NOTE(review): the returned handle is kept in a binding; presumably the
    // global is only configured while the handle is alive — confirm against the
    // tensorgraph-sys documentation.
    let _handle = stream.as_global();
    // create cublas context, with the provided stream, and configure it as the global
    let cublas_ctx = CublasContext::new();
    // Shadowing `_handle` does not drop the first handle: both values live
    // until the end of `main`.
    let _handle = cublas_ctx.with_stream(Some(&stream)).as_global();
    // cublas is the default BLAS implementation for CUDA when the feature is enabled
    run::<Cuda>();
    // openblas is the default BLAS implementation for CPU when the feature is enabled
    run::<Cpu>();
}
/// Device-generic matrix multiplication check.
///
/// Runs on the device selected by `D`, using that device's default allocator
/// and default BLAS provider. Panics if the computed product differs from the
/// expected one.
fn run<D: DefaultDeviceAllocator + DefaultBLASContext>()
where
    f32: BLAS<D::Context>,
{
    // Operands, written out as matrices:
    //
    //     0 1
    // A = 2 3
    //     4 5
    //
    // B = 0 1
    //     2 3
    //
    // The host slices are column major (read each column first).
    let lhs_data = DefaultVec::<f32, D>::copy_from_host(&[0., 2., 4., 1., 3., 5.]);
    let rhs_data = DefaultVec::<f32, D>::copy_from_host(&[0., 2., 1., 3.]);

    let lhs = Tensor::from_shape([3, 2], lhs_data); // 3 rows x 2 cols
    let rhs = Tensor::from_shape([2, 2], rhs_data); // 2 rows x 2 cols

    // Expected product:
    //
    //          2  3
    // C = AB = 6  11
    //          10 19
    let product = lhs.matmul(rhs.view());

    // Copy the result back into a host array so it can be compared directly.
    let mut host = [0.; 6];
    product.into_inner().copy_to_host(&mut host);
    assert_eq!(host, [2., 6., 10., 3., 11., 19.]);
}
Advanced example using openblas and cublas by passing blas contexts and allocators:
Enable features in the Cargo.toml:
tensorgraph-math = { version = "LATEST_VERSION", features = ["openblas", "cublas"] }
#![feature(allocator_api)]
use std::{alloc::Global, ops::Deref};
use tensorgraph_math::{
blas::{BLASContext, cublas::{CublasContext}, BLAS},
sys::{
device::{cuda::{Context, Cuda, Stream}, cpu::Cpu, Device, DeviceAllocator},
Vec, View,
},
tensor::Tensor,
};
/// Entry point of the example: builds the CUDA stream and cuBLAS context
/// explicitly and hands them to `run`, then calls `run` again with the CPU
/// arguments.
fn main() {
    // init cuda context
    let cuda_ctx = Context::quick_init().unwrap();
    // create cuda stream
    let stream = Stream::new(&cuda_ctx).unwrap();
    // create cublas context, with the provided stream
    let cublas_ctx = CublasContext::new();
    let cublas_ctx = cublas_ctx.with_stream(Some(&stream));
    // run using the CUDA stream as the allocator, and cublas
    // as the BLAS provider
    run(cublas_ctx, stream.deref());
    // run using the CPU default BLAS and Global allocator
    // (`()` is the value passed as the BLAS context for this run)
    run((), Global);
}
/// Matrix multiplication check with an explicit BLAS context and allocator.
///
/// * `ctx` - BLAS context passed to the multiplication.
/// * `alloc` - allocator for the two inputs and for the result; it is passed
///   by value three times, hence the `Copy` bound.
///
/// Panics if the computed product differs from the expected one.
fn run<C: BLASContext, A: DeviceAllocator<Device = C::Device> + Copy>(ctx: C, alloc: A)
where
    f32: BLAS<C>,
{
    // Operands, written out as matrices:
    //
    //     0 1
    // A = 2 3
    //     4 5
    //
    // B = 0 1
    //     2 3
    //
    // The host slices are column major (read each column first).
    let lhs_data = Vec::copy_from_host_in(&[0., 2., 4., 1., 3., 5.], alloc);
    let rhs_data = Vec::copy_from_host_in(&[0., 2., 1., 3.0_f32], alloc);

    let lhs = Tensor::from_shape([3, 2], lhs_data); // 3 rows x 2 cols
    let rhs = Tensor::from_shape([2, 2], rhs_data); // 2 rows x 2 cols

    // Expected product:
    //
    //          2  3
    // C = AB = 6  11
    //          10 19
    let product = lhs.matmul_into(rhs.view(), ctx, alloc);

    // Copy the result back into a host array so it can be compared directly.
    let mut host = [0.; 6];
    product.into_inner().copy_to_host(&mut host);
    assert_eq!(host, [2., 6., 10., 3., 11., 19.]);
}
Dependencies
~0.1–10MB
~146K SLoC