TIL rocprof -d flag
I work with AMD GPUs and ROCm/HIP a little bit. If I write an MPI program that also uses HIP such that each MPI task runs on its own GPU, it is sometimes useful for me to profile each rank. AMD provides a tool called rocprof which is very useful, but by default it writes everything to a single results.csv file. If you’re profiling multiple ranks, different ranks can end up overwriting each other’s results.csv.
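For the single-process case, the plain invocation is all you need (this just restates the default behavior above; vecAdd is the example program used in the rest of this post):

$ rocprof ./vecAdd    # produces a single results.csv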
Say you had a silly little program where each MPI task is adding two arrays using a GPU. It might look something like:
#include <stdio.h>
#include <math.h>
#include <mpi.h>
#include <hip/hip_runtime.h>

/* ---------------------------------------------------------------------------------
Macro for checking errors in HIP API calls
----------------------------------------------------------------------------------*/
#define hipErrorCheck(call)                                                                  \
do{                                                                                          \
    hipError_t hipErr = call;                                                                \
    if(hipSuccess != hipErr){                                                                \
        printf("HIP Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(hipErr));  \
        exit(1);                                                                             \
    }                                                                                        \
}while(0)

/* ---------------------------------------------------------------------------------
Vector addition kernel
----------------------------------------------------------------------------------*/
__global__ void add_vectors(double *a, double *b, double *c, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if(id < n) c[id] = a[id] + b[id];
}

/* ---------------------------------------------------------------------------------
Main program
----------------------------------------------------------------------------------*/
int main(int argc, char *argv[]){

    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char name[MPI_MAX_PROCESSOR_NAME];
    int resultlength;
    MPI_Get_processor_name(name, &resultlength);

    if(rank == 0) {
        printf("number of ranks: %d\n", size);
    }

    // Start total runtime timer
    double start_time, end_time, elapsed_time;
    start_time = MPI_Wtime();

    // Array length
    long long int N = 256*1024*1024;
    size_t buffer_size = N * sizeof(double);

    // Allocate and initialize host arrays
    double *A = (double*)malloc(buffer_size);
    double *B = (double*)malloc(buffer_size);
    double *C = (double*)malloc(buffer_size);

    for(int i=0; i<N; i++){
        A[i] = 1.0;
        B[i] = 1.0;
        C[i] = 0.0;
    }

    // Allocate device arrays and copy inputs to the GPU
    double *d_A, *d_B, *d_C;
    hipErrorCheck( hipMalloc(&d_A, buffer_size) );
    hipErrorCheck( hipMalloc(&d_B, buffer_size) );
    hipErrorCheck( hipMalloc(&d_C, buffer_size) );

    hipErrorCheck( hipMemcpy(d_A, A, buffer_size, hipMemcpyHostToDevice) );
    hipErrorCheck( hipMemcpy(d_B, B, buffer_size, hipMemcpyHostToDevice) );

    hipEvent_t start, end;
    hipErrorCheck( hipEventCreate(&start) );
    hipErrorCheck( hipEventCreate(&end) );

    // Set execution configuration parameters
    int thr_per_blk = 256;
    int blk_in_grid = ceil( float(N) / thr_per_blk );

    // Time the kernel with HIP events
    hipErrorCheck( hipEventRecord(start, NULL) );
    add_vectors<<<dim3(blk_in_grid), dim3(thr_per_blk), 0, hipStreamDefault>>>(d_A, d_B, d_C, N);
    hipErrorCheck( hipEventRecord(end, NULL) );
    hipErrorCheck( hipEventSynchronize(end) );

    float milliseconds = 0.0;
    hipErrorCheck( hipEventElapsedTime(&milliseconds, start, end) );

    // Copy result back to the host
    hipErrorCheck( hipMemcpy(C, d_C, buffer_size, hipMemcpyDeviceToHost) );

    float max_gpu_time;
    MPI_Reduce(&milliseconds, &max_gpu_time, 1, MPI_FLOAT, MPI_MAX, 0, MPI_COMM_WORLD);

    // Verify the result: every element of C should be 2, so sum/(2N) == 1
    double sum = 0.0;
    for(int i=0; i<N; i++){
        sum = sum + C[i];
    }

    double result = sum / (double)(2*N);
    if(result != 1){
        printf("In rank %d: Test failed!\n", rank);
        exit(1);
    }

    // Clean up
    hipErrorCheck( hipFree(d_A) );
    hipErrorCheck( hipFree(d_B) );
    hipErrorCheck( hipFree(d_C) );

    free(A);
    free(B);
    free(C);

    end_time = MPI_Wtime();
    elapsed_time = end_time - start_time;

    double total_time_max;
    MPI_Reduce(&elapsed_time, &total_time_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if(rank == 0) {
        printf("Result = %.16f\n", result);
        printf("Array buffer size = %zu\n", buffer_size);
        printf("Max GPU time (s) = %.6f\n", max_gpu_time / 1000.0);
        printf("Max MPI time (s) = %.6f\n", total_time_max);  // MPI_Wtime() already returns seconds
    }

    MPI_Finalize();

    return 0;
}
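To build it, you would typically compile with hipcc and point it at an MPI installation. Something along these lines should work, but the MPI_PATH placeholder (and whether you use hipcc directly or your system's MPI compiler wrapper) depends on your machine:

# illustrative build command; MPI_PATH is a placeholder for your MPI install location
hipcc -I${MPI_PATH}/include -L${MPI_PATH}/lib -lmpi vecAdd.cpp -o vecAdd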
Say you run it on a system with a Slurm scheduler, with a command like the one below that starts 4 MPI tasks:
srun -N1 -n4 --gpus-per-task=1 rocprof ./vecAdd
This would create four instances of rocprof that would clobber each other’s results.csv output.
A way around this is to use the -d flag to specify a directory. So instead your srun command could look like:
srun -N1 -n4 --gpus-per-task=1 rocprof -d results ./vecAdd
Now each rank generates its own subdirectory inside a results directory that gets created in your current directory. If you look at the results directory you will see:
$ ls results
rpl_data_230815_193349_122832 rpl_data_230815_193349_122833 rpl_data_230815_193349_76989 rpl_data_230815_193349_76990
where each rank has its own directory. You can find the profiling results for one of the ranks, for example, in rpl_data_230815_193349_76990/input_results_230815_193349/results.txt.
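If you don’t want to remember that layout, a quick way to list every rank’s results file is:

$ find results -name results.txt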
There’s probably a more streamlined way to get each rank to write its profiling results somewhere predictable instead of having to dig through the results directory (and this could potentially have problems when you’re using MPI across multiple nodes and the pids of two MPI ranks on separate nodes happen to be the same). But at least learning about the -d flag is a start.
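One idea, which I haven’t tested carefully, is a small wrapper script that uses Slurm’s SLURM_PROCID environment variable (the global rank srun assigns to each task) to give every rank its own directory and output file via rocprof’s -d and -o flags:

#!/bin/bash
# rocprof_wrapper.sh (sketch): give each MPI rank its own rocprof output
# SLURM_PROCID is set by srun to the global task rank
outdir="results/rank_${SLURM_PROCID}"
mkdir -p "${outdir}"
exec rocprof -d "${outdir}" -o "${outdir}/results.csv" "$@"

and then launch it with something like:

srun -N1 -n4 --gpus-per-task=1 ./rocprof_wrapper.sh ./vecAdd

Since the directory name comes from the rank instead of the pid, this should also sidestep the cross-node pid collision mentioned above.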