There are a number of ways of getting (from the command line) the bandwidth over the whole application, but it sounds like there are a number of kernels you'd like to look at individually. In that case, wrapping parts of your code with PAPI calls is a perfectly sensible way to go.
You can use PAPI event counters on your system (papi_avail) to find the total number of load/store instructions, and if you know the sizes of your loads/stores you can get the memory bandwidth. Alternatively, you can count hits in your caches and multiply by the line sizes to infer the actual amount of data transferred across the system. There is documentation in various places on the PAPI wiki, e.g. here for the high-level interface, and here are some useful formulas for helpful derived quantities.
Here's a coded-up simple example, doing a matrix-vector multiplication the sensible way and the cache-unfriendly transposed way. Note that calling PAPI_read_counters resets the counters, which is what we want here.
#include <stdio.h>
#include <stdlib.h>
typedef char * caddr_t;
#include <papi.h>
#include <sys/time.h>
/* Allocates the matrix and both vectors and fills them with initial values;
 * returns 0 on success and a negative code if an allocation fails. */
int init(float ***a, float **x, float **y, int size);
/* Prints the counter totals and the rates derived from them for one test. */
void report_results(char *tname, long_long *values, const int n, double wtime);
/* Accumulates a*x into y with the row index in the outer loop. */
void sensible_matvec(float **a, float *x, float *y, int size);
/* Accumulates a*x into y with the column index in the outer loop. */
void wrong_order_matvec(float **a, float *x, float *y, int size);
/* Stores the current time of day in *t. */
void tick(struct timeval *t);
/* Returns the seconds elapsed since the time stored in *t. */
double tock(struct timeval *t);
/* Number of PAPI events counted by main(). */
#define NUM_EVENTS 3
/* Defined later in this file; declared here so main() can release the buffers. */
void freeall(float ***a, float **x, float **y, int size);

/*
 * Runs the two matrix-vector kernels under PAPI counters and reports, for
 * each, the elapsed wall time and the rates derived from the counters.
 *
 * The counters are started once. PAPI_read_counters() is used after the
 * first kernel because it also resets the counters, so the values returned
 * by PAPI_stop_counters() after the second kernel cover that kernel only.
 *
 * Returns 0 on success; exits with status 1 if allocation or any PAPI call
 * fails.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int matsize = 4096;
    float **a = NULL, *x = NULL, *y = NULL;

    /* Without this check a failed allocation would be dereferenced below. */
    if (init(&a, &x, &y, matsize) != 0) {
        fprintf(stderr, "Error allocating matrix and vectors; aborting\n");
        exit(1);
    }

    int events[NUM_EVENTS] = {PAPI_L1_DCM, PAPI_LST_INS, PAPI_FP_INS};
    long_long values[NUM_EVENTS];
    double walltime;
    struct timeval t;

    if (PAPI_start_counters(events, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error starting PAPI counters; aborting\n");
        freeall(&a, &x, &y, matsize);
        exit(1);
    }

    tick(&t);
    sensible_matvec(a, x, y, matsize);
    if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error reading PAPI counters; aborting\n");
        freeall(&a, &x, &y, matsize);
        exit(1);
    }
    walltime = tock(&t);
    report_results("Sensible", values, NUM_EVENTS, walltime);

    tick(&t);
    wrong_order_matvec(a, x, y, matsize);
    if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error stopping PAPI counters; aborting\n");
        freeall(&a, &x, &y, matsize);
        exit(1);
    }
    walltime = tock(&t);
    report_results("Wrong order", values, NUM_EVENTS, walltime);

    freeall(&a, &x, &y, matsize);
    return 0;
}
/*
 * Prints the results of one timed test to stdout.
 *
 * tname  - label printed for the test.
 * values - counter totals; slot 0 is read as L1 data-cache misses, slot 1
 *          as load/store instructions and slot 2 as floating-point
 *          instructions (the order main() puts in its events array).
 * n      - number of entries in values; at least 3 are required.
 * wtime  - elapsed wall-clock time in seconds.
 *
 * If values is NULL or holds fewer than 3 entries, a message is written to
 * stderr and nothing is reported. A rate whose divisor is zero (or whose
 * elapsed time is not positive) is reported as 0 instead of inf/nan.
 *
 * Bandwidth assumes every load/store moves one float. Both "mega" rates
 * divide by 1024*1024.
 */
void report_results(char *tname, long_long *values, const int n, double wtime) {
    enum { L1_MISS_SLOT = 0, LOAD_STORE_SLOT = 1, FP_INS_SLOT = 2, SLOTS_NEEDED = 3 };

    if (values == NULL || n < SLOTS_NEEDED) {
        fprintf(stderr, "report_results: expected %d counter values, got %d\n",
                SLOTS_NEEDED, n);
        return;
    }

    long_long total_mem = values[LOAD_STORE_SLOT];
    long_long total_flops = values[FP_INS_SLOT];
    long_long l1misses = values[L1_MISS_SLOT];

    const int have_time = wtime > 0.0;
    double bandwidth = 0.0;
    double mflops = 0.0;
    double miss_rate = 0.0;

    if (have_time) {
        bandwidth = 1.0*total_mem*sizeof(float)/(wtime*1024*1024);
        mflops = 1.0*total_flops/(wtime*1024*1024);
    }
    if (total_mem != 0) {
        miss_rate = 1.0*l1misses/total_mem;
    }

    printf("Test %s: time elapsed = %f, memory accesses = %lld, flop = %lld\n",
           tname, wtime, total_mem, total_flops);
    printf("\tMemory bandwidth (MB/sec) = %f\n", bandwidth);
    printf("\tL1 cache miss rate = %f\n", miss_rate);
    printf("\tMFLOPS = %lf\n\n", mflops);
}
/* Allocates an n x n matrix as one data block plus a row-pointer table;
 * returns 0 on success, -1 on failure. */
int alloc2d(float ***a, int n);
/* Frees a matrix obtained from alloc2d; always returns 0. */
int free2d(float ***a, int n);
/* Allocates a vector of n floats; returns 0 on success, -1 on failure. */
int alloc1d(float **x, int n);
/* Frees a vector obtained from alloc1d; always returns 0. */
int free1d(float **x, int n);
/*
 * Allocates the size x size matrix *a and the vectors *x and *y, then
 * fills them: x[i] = i, y[i] = 0 and a[i][j] = i.
 *
 * Returns 0 on success. On failure everything allocated so far is freed,
 * the corresponding output pointers are set to NULL, and a distinct code
 * identifies what could not be allocated:
 *   -1  the matrix
 *   -2  the x vector
 *   -3  the y vector
 */
int init(float ***a, float **x, float **y, int size) {
    if (alloc2d(a, size))
        return -1;

    if (alloc1d(x, size)) {
        free2d(a, size);
        *a = NULL;
        *x = NULL;
        return -2;
    }

    if (alloc1d(y, size)) {
        free2d(a, size);
        free1d(x, size);
        *a = NULL;
        *x = NULL;
        *y = NULL;
        return -3;
    }

    float **mat = *a;
    float *xv = *x;
    float *yv = *y;

    for (int i = 0; i < size; i++) {
        xv[i] = (float)i;
        yv[i] = 0.;
    }

    /* Every element of row i holds the row index. */
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            mat[i][j] = i;
        }
    }

    return 0;
}
/*
 * Adds the matrix-vector product a*x into y.
 *
 * The row index is in the outer loop, so the inner loop walks along the
 * elements of a single row a[row]. y is accumulated into, not overwritten;
 * the caller supplies its starting contents.
 *
 * a    - size row pointers, each to size floats.
 * x    - input vector of size floats.
 * y    - output vector of size floats, updated in place.
 * size - dimension; nothing happens when it is <= 0.
 */
void sensible_matvec(float **a, float *x, float *y, int size) {
    int row = 0;
    while (row < size) {
        int col = 0;
        while (col < size) {
            y[row] += a[row][col] * x[col];
            col++;
        }
        row++;
    }
}
/*
 * Adds the matrix-vector product a*x into y, with the loops deliberately
 * swapped relative to sensible_matvec.
 *
 * The column index is in the outer loop, so the inner loop steps from row
 * to row for a fixed column. y is accumulated into, not overwritten.
 *
 * a    - size row pointers, each to size floats.
 * x    - input vector of size floats.
 * y    - output vector of size floats, updated in place.
 * size - dimension; nothing happens when it is <= 0.
 */
void wrong_order_matvec(float **a, float *x, float *y, int size) {
    int col = 0;
    while (col < size) {
        int row = 0;
        while (row < size) {
            y[row] += a[row][col] * x[col];
            row++;
        }
        col++;
    }
}
/*
 * Starts a timing interval by storing the current time of day in *t.
 * The result of gettimeofday() is not checked.
 */
void tick(struct timeval *t)
{
    (void)gettimeofday(t, NULL);
}
/*
 * Ends a timing interval: returns the number of seconds between the time
 * stored in *t and the current time of day, with the microsecond
 * difference added as a fraction.
 */
double tock(struct timeval *t)
{
    struct timeval end;
    gettimeofday(&end, NULL);

    const double whole_seconds = (double)(end.tv_sec - t->tv_sec);
    const double micro_delta = (double)(end.tv_usec - t->tv_usec);

    return whole_seconds + (micro_delta / 1000000.);
}
/*
 * Releases the matrix *a and the vectors *x and *y by handing them to
 * free2d and free1d. The status codes those helpers return are discarded.
 */
void freeall(float ***a, float **x, float **y, int size)
{
    (void)free2d(a, size);
    (void)free1d(x, size);
    (void)free1d(y, size);
}
/*
 * Allocates an n x n matrix of floats.
 *
 * The elements live in one contiguous block; *a receives a table of n row
 * pointers into that block, so (*a)[i][j] addresses element (i, j) and
 * (*a)[0] is the start of the data block.
 *
 * Returns 0 on success. Returns -1, with *a set to NULL, if n is not
 * positive, if the requested size does not fit in size_t, or if either
 * allocation fails. All size arithmetic is done in size_t, so n*n can no
 * longer overflow int.
 */
int alloc2d(float ***a, int n) {
    if (a == NULL)
        return -1;
    *a = NULL;

    if (n <= 0)
        return -1;

    const size_t dim = (size_t)n;
    const size_t size_limit = (size_t)-1;

    /* Reject dim*dim*sizeof(float) and dim*sizeof(float *) if they wrap. */
    if (dim > size_limit / sizeof(float) / dim)
        return -1;
    if (dim > size_limit / sizeof(float *))
        return -1;

    float *data = malloc(dim * dim * sizeof *data);
    if (data == NULL)
        return -1;

    float **rows = malloc(dim * sizeof *rows);
    if (rows == NULL) {
        free(data);
        return -1;
    }

    for (size_t i = 0; i < dim; i++)
        rows[i] = data + i * dim;

    *a = rows;
    return 0;
}
/*
 * Frees a matrix laid out as one data block plus a row-pointer table,
 * where (*a)[0] is the start of the data block.
 *
 * a - address of the matrix pointer; *a is set to NULL afterwards so a
 *     repeated call is harmless. A NULL a or NULL *a is accepted and
 *     ignored.
 * n - unused; kept for symmetry with alloc2d.
 *
 * Always returns 0.
 */
int free2d(float ***a, int n) {
    (void)n;

    if (a == NULL || *a == NULL)
        return 0;

    free((*a)[0]);   /* the contiguous element block */
    free(*a);        /* the row-pointer table */
    *a = NULL;
    return 0;
}
/*
 * Allocates a vector of n floats and stores its address in *a.
 * Returns 0 on success, or -1 (leaving *a NULL) when malloc fails.
 * The contents of the new vector are not initialized.
 */
int alloc1d(float **a, int n)
{
    float *buffer = malloc(n * sizeof *buffer);

    *a = buffer;
    return (buffer != NULL) ? 0 : -1;
}
/*
 * Frees the vector whose address is stored in *a. The pointer in *a is
 * left unchanged. n is unused. Always returns 0.
 */
int free1d(float **a, int n)
{
    float *buffer = *a;

    (void)n;
    free(buffer);
    return 0;
}
Running gives:
$ gcc -o papi-test papi-test.c -I${PAPI_INC_DIR} -L${PAPI_LIB_DIR} -lpapi -Wall -std=c99
$ ./papi-test
Test Sensible: time elapsed = 0.121877, memory accesses = 302020775, flop = 33580481
Memory bandwidth (MB/sec) = 9453.119330
L1 cache miss rate = 0.003921
MFLOPS = 262.763624
Test Wrong order: time elapsed = 0.537639, memory accesses = 302026751, flop = 39629352
Memory bandwidth (MB/sec) = 2142.963254
L1 cache miss rate = 0.094045
MFLOPS = 70.295301