Using rocSOLVER¶
Once installed, rocSOLVER can be used just like any other library with a C API. The header file will need to be included in the user code, and both the rocBLAS and rocSOLVER shared libraries will become link-time and run-time dependencies for the user application.
QR factorization of a single matrix¶
The following code snippet uses rocSOLVER to compute the QR factorization of a general m-by-n real matrix in double precision. For a full description of the used rocSOLVER routine, see the API documentation here: rocsolver_<type>geqrf().
/////////////////////////////
// example.cpp source code //
/////////////////////////////
#include <algorithm> // for std::min
#include <stdio.h> // for size_t, printf
#include <vector>
#include <hip/hip_runtime_api.h> // for hip functions
#include <rocsolver.h> // for all the rocsolver C interfaces and type declarations
// Example: Compute the QR Factorization of a matrix on the GPU
void get_example_matrix(std::vector<double>& hA,
rocblas_int& M,
rocblas_int& N,
rocblas_int& lda) {
// a *very* small example input; not a very efficient use of the API
const double A[3][3] = { { 12, -51, 4},
{ 6, 167, -68},
{ -4, 24, -41} };
M = 3;
N = 3;
lda = 3;
// note: rocsolver matrices must be stored in column major format,
// i.e. entry (i,j) should be accessed by hA[i + j*lda]
hA.resize(size_t(lda) * N);
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < N; ++j) {
// copy A (2D array) into hA (1D array, column-major)
hA[i + j*lda] = A[i][j];
}
}
}
// We use rocsolver_dgeqrf to factor a real M-by-N matrix, A.
// See https://rocsolver.readthedocs.io/en/latest/userguide_api.html#_CPPv416rocsolver_dgeqrf14rocblas_handleK11rocblas_intK11rocblas_intPdK11rocblas_intPd
// and https://www.netlib.org/lapack/explore-html/df/dc5/group__variants_g_ecomputational_ga3766ea903391b5cf9008132f7440ec7b.html
int main() {
rocblas_int M; // rows
rocblas_int N; // cols
rocblas_int lda; // leading dimension
std::vector<double> hA; // input matrix on CPU
get_example_matrix(hA, M, N, lda);
// let's print the input matrix, just to see it
printf("A = [\n");
for (size_t i = 0; i < M; ++i) {
printf(" ");
for (size_t j = 0; j < N; ++j) {
printf("% .3f ", hA[i + j*lda]);
}
printf(";\n");
}
printf("]\n");
// initialization
rocblas_handle handle;
rocblas_create_handle(&handle);
// calculate the sizes of our arrays
size_t size_A = size_t(lda) * N; // count of elements in matrix A
size_t size_piv = size_t(std::min(M, N)); // count of Householder scalars
// allocate memory on GPU
double *dA, *dIpiv;
hipMalloc(&dA, sizeof(double)*size_A);
hipMalloc(&dIpiv, sizeof(double)*size_piv);
// copy data to GPU
hipMemcpy(dA, hA.data(), sizeof(double)*size_A, hipMemcpyHostToDevice);
// compute the QR factorization on the GPU
rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv);
// copy the results back to CPU
std::vector<double> hIpiv(size_piv); // array for householder scalars on CPU
hipMemcpy(hA.data(), dA, sizeof(double)*size_A, hipMemcpyDeviceToHost);
hipMemcpy(hIpiv.data(), dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost);
// the results are now in hA and hIpiv
// we can print some of the results if we want to see them
printf("R = [\n");
for (size_t i = 0; i < M; ++i) {
printf(" ");
for (size_t j = 0; j < N; ++j) {
printf("% .3f ", (i <= j) ? hA[i + j*lda] : 0);
}
printf(";\n");
}
printf("]\n");
// clean up
hipFree(dA);
hipFree(dIpiv);
rocblas_destroy_handle(handle);
}
The exact command used to compile the example above may vary depending on the system environment, but here is a typical example:
/opt/rocm/bin/hipcc -I/opt/rocm/include -c example.cpp
/opt/rocm/bin/hipcc -o example -L/opt/rocm/lib -lrocsolver -lrocblas example.o
QR factorization of a batch of matrices¶
One of the advantages of using GPUs is the ability to execute in parallel many operations of the same type but on different data sets. Based on this idea, rocSOLVER and rocBLAS provide a batch version of most of their routines. These batch versions allow the user to execute the same operation on a set of different matrices and/or vectors with a single library call. For more details on the approach to batch functionality followed in rocSOLVER, see Batch rocSOLVER.
Strided_batched version¶
The following code snippet uses rocSOLVER to compute the QR factorization of a series of general m-by-n real matrices in double precision. The matrices must be stored in contiguous memory locations on the GPU, and are accessed by a pointer to the first matrix and a stride value that gives the separation between one matrix and the next one. For a full description of the used rocSOLVER routine, see the API documentation here: rocsolver_<type>geqrf_strided_batched().
/////////////////////////////////////
// example_strided.cpp source code //
/////////////////////////////////////
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <rocsolver.h> // this includes all the rocsolver C interfaces and type declarations
using namespace std;
int main() {
rocblas_int M;
rocblas_int N;
rocblas_int lda;
rocblas_int batch_count; // number of matrices to factorize in the batch
// initialize batch_count, M, N and lda with desired values
// here===>>
rocblas_handle handle;
rocblas_create_handle(&handle); // this creates the rocblas handle
rocblas_int strideA = lda*N; // separation between two matrices in memory
size_t size_A = size_t(strideA)*batch_count; // size of the array that holds the matrices
rocblas_int strideP = min(M,N); // separation between two sets of Householder scalars
size_t size_piv = size_t(strideP)*batch_count; // size of the array that will have the Householder scalars
vector<double> hA(size_A); // creates array for matrices in CPU
vector<double> hIpiv(size_piv); // creates array for householder scalars in CPU
double *dA, *dIpiv;
hipMalloc(&dA,sizeof(double)*size_A); // allocates memory for matrices in GPU
hipMalloc(&dIpiv,sizeof(double)*size_piv); // allocates memory for scalars in GPU
// initialize all matrices (array hA) with input data
// here===>>
// ( matrices must be stored in column major format, i.e. entry (i,j)
// of the k-th matrix in the batch should be accessed by hA[i + j*lda + k*strideA] )
hipMemcpy(dA,hA.data(),sizeof(double)*size_A,hipMemcpyHostToDevice); // copy data to GPU
rocsolver_dgeqrf_strided_batched(handle, M, N, dA,
lda, strideA, dIpiv, strideP, batch_count); // compute the QR factorization on the GPU
hipMemcpy(hA.data(),dA,sizeof(double)*size_A,hipMemcpyDeviceToHost); // copy the results back to CPU
hipMemcpy(hIpiv.data(),dIpiv,sizeof(double)*size_piv,hipMemcpyDeviceToHost);
// do something with the results in hA and hIpiv
// here===>>
hipFree(dA); // de-allocate GPU memory
hipFree(dIpiv);
rocblas_destroy_handle(handle); // destroy handle
return 0;
}
Batched version¶
The following code snippet uses rocSOLVER to compute the QR factorization of a series of general m-by-n real matrices in double precision. The matrices do not need to be in contiguous memory locations on the GPU, and will be accessed by an array of pointers. For a full description of the used rocSOLVER routine, see the API documentation here: rocsolver_<type>geqrf_batched().
/////////////////////////////////////
// example_batched.cpp source code //
/////////////////////////////////////
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <rocsolver.h> // this includes all the rocsolver C interfaces and type declarations
using namespace std;
int main() {
rocblas_int M;
rocblas_int N;
rocblas_int lda;
rocblas_int batch_count; // number of matrices to factorize in the batch
// initialize batch_count, M, N and lda with desired values
// here===>>
rocblas_handle handle;
rocblas_create_handle(&handle); // this creates the rocblas handle
size_t size_A = size_t(lda)*N; // size of the array that holds one matrix
rocblas_int strideP = min(M,N); // separation between two sets of Householder scalars
size_t size_piv = size_t(strideP)*batch_count; // size of the array that will have the Householder scalars
vector<double> hIpiv(size_piv); // creates array for householder scalars in CPU
vector<double> hA[batch_count]; // creates array on the CPU of pointers to the CPU
for(int b=0; b < batch_count; ++b) {
hA[b] = vector<double>(size_A); // each pointer will point to a matrix of the batch on the CPU
}
double* A[batch_count]; // creates array on the CPU of pointers to the GPU
for (int b = 0; b < batch_count; ++b)
hipMalloc(&A[b], sizeof(double)*size_A); // each pointer will point to a matrix of the batch on the GPU
double **dA, *dIpiv;
hipMalloc(&dA,sizeof(double*) * size_A); // array on the GPU of pointers on the GPU
hipMalloc(&dIpiv,sizeof(double)*size_piv); // allocates memory for scalars in GPU
// initialize all matrices (arrays hA[k]) with input data
// here===>>
// ( matrices must be stored in column major format, i.e. entry (i,j)
// of the k-th matrix in the batch should be accessed by hA[k][i + j*lda] )
for(int b=0; b < batch_count; ++b)
hipMemcpy(A[b], hA[b].data(), sizeof(double)*size_A, hipMemcpyHostToDevice); // copy data to GPU
hipMemcpy(dA, A, sizeof(double*) * batch_count, hipMemcpyHostToDevice); // copy pointers to GPU
rocsolver_dgeqrf_batched(handle, M, N, dA,
lda, dIpiv, strideP, batch_count); // compute the QR factorization on the GPU
for(int b=0;b<batch_count;b++)
hipMemcpy(hA[b].data(), A[b], sizeof(double) * size_A, hipMemcpyDeviceToHost); // copy the results back
hipMemcpy(hIpiv.data(),dIpiv,sizeof(double)*size_piv,hipMemcpyDeviceToHost);
// do something with the results in hA and hIpiv
// here===>>
for(int b=0;b<batch_count;++b)
hipFree(A[b]);
hipFree(dA); // de-allocate GPU memory
hipFree(dIpiv);
rocblas_destroy_handle(handle); // destroy handle
return 0;
}